diff options
Diffstat (limited to 'fs/btrfs')
53 files changed, 3398 insertions, 1576 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a26c63b4ad68..2e558227931a 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -91,3 +91,14 @@ config BTRFS_ASSERT any of the assertions trip. This is meant for btrfs developers only. If unsure, say N. + +config BTRFS_FS_REF_VERIFY + bool "Btrfs with the ref verify tool compiled in" + depends on BTRFS_FS + default n + help + Enable run-time extent reference verification instrumentation. This + is meant to be used by btrfs developers for tracking down extent + reference problems or verifying they didn't break something. + + If unsure, say N. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 962a95aefb81..6fe881d5cb38 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_BTRFS_FS) := btrfs.o @@ -9,10 +10,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o free-space-tree.o + uuid-tree.o props.o hash.o free-space-tree.o tree-checker.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e00c8a9fd5bb..d5540749f0e5 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -67,7 +67,7 @@ struct btrfs_workqueue { static void normal_work_helper(struct btrfs_work *work); #define BTRFS_WORK_HELPER(name) \ -void btrfs_##name(struct work_struct *arg) \ +noinline_for_stack void btrfs_##name(struct work_struct *arg) \ { \ struct btrfs_work *work = container_of(arg, struct btrfs_work, \ normal_work); \ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b517ef1477ea..7d0dc100a09a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -40,12 +40,14 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 offset = 0; struct extent_inode_elem *e; - if (!btrfs_file_extent_compression(eb, fi) && + if (!ignore_offset && + !btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -84,7 +86,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie) + struct extent_inode_elem **eie, + bool ignore_offset) { u64 disk_byte; struct btrfs_key key; @@ -113,7 +116,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); if (ret < 0) return ret; } @@ -419,7 +422,7 @@ static int add_indirect_ref(const struct btrfs_fs_info *fs_info, static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, - u64 total_refs) + u64 total_refs, bool ignore_offset) { int ret = 0; int slot; @@ -472,7 +475,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, *extent_item_pos, - &eie); + &eie, ignore_offset); if (ret < 0) break; } @@ -510,7 +513,8 @@ next: static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + bool ignore_offset) { struct btrfs_root *root; struct btrfs_key root_key; @@ -581,7 +585,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, ref, level, time_seq, - extent_item_pos, total_refs); + extent_item_pos, total_refs, ignore_offset); out: path->lowest_level = 0; btrfs_release_path(path); @@ -616,7 +620,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, const u64 *extent_item_pos, u64 total_refs, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { int err; int ret = 0; @@ -661,7 +665,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, } err = resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, - total_refs); + total_refs, ignore_offset); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -769,6 +773,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, struct btrfs_key key; struct btrfs_key tmp_op_key; struct btrfs_key *op_key = NULL; + struct rb_node *n; int count; int ret = 0; @@ -778,7 +783,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, } spin_lock(&head->lock); - list_for_each_entry(node, &head->ref_list, list) { + for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) { + node = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); if (node->seq > seq) continue; @@ -1107,13 +1114,17 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * * Otherwise this returns 0 for success and <0 for an error. * + * If ignore_offset is set to false, only extent refs whose offsets match + * extent_item_pos are returned. If true, every extent ref is returned + * and extent_item_pos is ignored. + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc) + struct share_check *sc, bool ignore_offset) { struct btrfs_key key; struct btrfs_path *path; @@ -1178,7 +1189,7 @@ again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1189,7 +1200,7 @@ again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto again; } spin_unlock(&delayed_refs->lock); @@ -1235,7 +1246,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, total_refs, sc); + extent_item_pos, total_refs, sc, ignore_offset); if (ret) goto out; @@ -1282,7 +1293,7 @@ again: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie); + *extent_item_pos, &eie, ignore_offset); btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) @@ -1350,7 +1361,7 @@ static void free_leaf_list(struct ulist *blocks) static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos) + const u64 *extent_item_pos, bool ignore_offset) { int ret; @@ -1359,7 +1370,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + *leafs, NULL, extent_item_pos, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1383,7 +1394,8 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1402,7 +1414,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL); + tmp, *roots, NULL, NULL, ignore_offset); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1421,14 +1433,15 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) + u64 time_seq, struct ulist **roots, + bool ignore_offset) { int ret; if (!trans) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots); + time_seq, roots, ignore_offset); if (!trans) up_read(&fs_info->commit_root_sem); return ret; @@ -1483,7 +1496,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared); + roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1877,7 +1890,8 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -1903,14 +1917,15 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, tree_mod_seq_elem.seq, &refs, - &extent_item_pos); + &extent_item_pos, ignore_offset); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + tree_mod_seq_elem.seq, &roots, + ignore_offset); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -1943,7 +1958,8 @@ out: int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx) + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset) { int ret; u64 extent_item_pos; @@ -1961,7 +1977,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx); + iterate, ctx, ignore_offset); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e410335841aa..0c2fab8514ff 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -43,17 +43,19 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx); + iterate_extent_inodes_t *iterate, void *ctx, + bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots); + u64 time_seq, struct ulist **roots, bool ignore_offset); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, struct extent_buffer *eb_in, u64 parent, diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index eccadb5f62a5..63f0ccc92a71 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -36,14 +36,13 @@ #define BTRFS_INODE_ORPHAN_META_RESERVED 1 #define BTRFS_INODE_DUMMY 2 #define BTRFS_INODE_IN_DEFRAG 3 -#define BTRFS_INODE_DELALLOC_META_RESERVED 4 -#define BTRFS_INODE_HAS_ORPHAN_ITEM 5 -#define BTRFS_INODE_HAS_ASYNC_EXTENT 6 -#define BTRFS_INODE_NEEDS_FULL_SYNC 7 -#define BTRFS_INODE_COPY_EVERYTHING 8 -#define BTRFS_INODE_IN_DELALLOC_LIST 9 -#define BTRFS_INODE_READDIO_NEED_LOCK 10 -#define BTRFS_INODE_HAS_PROPS 11 +#define BTRFS_INODE_HAS_ORPHAN_ITEM 4 +#define BTRFS_INODE_HAS_ASYNC_EXTENT 5 +#define BTRFS_INODE_NEEDS_FULL_SYNC 6 +#define BTRFS_INODE_COPY_EVERYTHING 7 +#define BTRFS_INODE_IN_DELALLOC_LIST 8 +#define BTRFS_INODE_READDIO_NEED_LOCK 9 +#define BTRFS_INODE_HAS_PROPS 10 /* in memory btrfs inode */ struct btrfs_inode { @@ -176,7 +175,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ unsigned outstanding_extents; - unsigned reserved_extents; + + struct btrfs_block_rsv block_rsv; /* * Cached values of inode properties @@ -267,6 +267,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) return false; } +static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, + int mod) +{ + lockdep_assert_held(&inode->lock); + inode->outstanding_extents += mod; + if (btrfs_is_free_space_inode(inode)) + return; + trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode), + mod); +} + static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { int ret = 0; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7d5a9b51f0d7..7d51b5a5b505 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -613,7 +613,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev)) & + (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); list_add(&ds->collision_resolving_node, h->table + hashval); @@ -2803,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2913,7 +2913,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, state = kvzalloc(sizeof(*state), GFP_KERNEL); if (!state) { pr_info("btrfs check-integrity: allocation failed!\n"); - return -1; + return -ENOMEM; } if (!btrfsic_is_initialized) { @@ -2945,7 +2945,7 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, if (NULL == ds) { pr_info("btrfs check-integrity: kmalloc() failed!\n"); mutex_unlock(&btrfsic_mutex); - return -1; + return -ENOMEM; } ds->bdev = device->bdev; ds->state = state; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b51d23f5cafa..b35ce16b3df3 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -33,6 +33,8 @@ #include <linux/bit_spinlock.h> #include <linux/slab.h> #include <linux/sched/mm.h> +#include <linux/sort.h> +#include <linux/log2.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -107,7 +109,8 @@ static void end_compressed_bio_read(struct bio *bio) struct inode *inode; struct page *page; unsigned long index; - int ret; + unsigned int mirror = btrfs_io_bio(bio)->mirror_num; + int ret = 0; if (bio->bi_status) cb->errors = 1; @@ -118,6 +121,21 @@ static void end_compressed_bio_read(struct bio *bio) if (!refcount_dec_and_test(&cb->pending_bios)) goto out; + /* + * Record the correct mirror_num in cb->orig_bio so that + * read-repair can work properly. + */ + ASSERT(btrfs_io_bio(cb->orig_bio)); + btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + cb->mirror_num = mirror; + + /* + * Some IO in this cb have failed, just skip checksum as there + * is no way it could be correct. + */ + if (cb->errors == 1) + goto csum_failed; + inode = cb->inode; ret = check_compressed_csum(BTRFS_I(inode), cb, (u64)bio->bi_iter.bi_sector << 9); @@ -239,7 +257,8 @@ static void end_compressed_bio_write(struct bio *bio) cb->start, cb->start + cb->len - 1, NULL, - bio->bi_status ? 0 : 1); + bio->bi_status ? + BLK_STS_OK : BLK_STS_NOTSUPP); cb->compressed_pages[0]->mapping = NULL; end_compressed_writeback(inode, cb); @@ -690,7 +709,86 @@ out: return ret; } -static struct { +/* + * Heuristic uses systematic sampling to collect data from the input data + * range, the logic can be tuned by the following constants: + * + * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample + * @SAMPLING_INTERVAL - range from which the sampled data can be collected + */ +#define SAMPLING_READ_SIZE (16) +#define SAMPLING_INTERVAL (256) + +/* + * For statistical analysis of the input data we consider bytes that form a + * Galois Field of 256 objects. Each object has an attribute count, ie. how + * many times the object appeared in the sample. + */ +#define BUCKET_SIZE (256) + +/* + * The size of the sample is based on a statistical sampling rule of thumb. + * The common way is to perform sampling tests as long as the number of + * elements in each cell is at least 5. + * + * Instead of 5, we choose 32 to obtain more accurate results. + * If the data contain the maximum number of symbols, which is 256, we obtain a + * sample size bound by 8192. + * + * For a sample of at most 8KB of data per data range: 16 consecutive bytes + * from up to 512 locations. + */ +#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ + SAMPLING_READ_SIZE / SAMPLING_INTERVAL) + +struct bucket_item { + u32 count; +}; + +struct heuristic_ws { + /* Partial copy of input data */ + u8 *sample; + u32 sample_size; + /* Buckets store counters for each byte value */ + struct bucket_item *bucket; + struct list_head list; +}; + +static void free_heuristic_ws(struct list_head *ws) +{ + struct heuristic_ws *workspace; + + workspace = list_entry(ws, struct heuristic_ws, list); + + kvfree(workspace->sample); + kfree(workspace->bucket); + kfree(workspace); +} + +static struct list_head *alloc_heuristic_ws(void) +{ + struct heuristic_ws *ws; + + ws = kzalloc(sizeof(*ws), GFP_KERNEL); + if (!ws) + return ERR_PTR(-ENOMEM); + + ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); + if (!ws->sample) + goto fail; + + ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); + if (!ws->bucket) + goto fail; + + INIT_LIST_HEAD(&ws->list); + return &ws->list; +fail: + free_heuristic_ws(&ws->list); + return ERR_PTR(-ENOMEM); +} + +struct workspaces_list { struct list_head idle_ws; spinlock_t ws_lock; /* Number of free workspaces */ @@ -699,7 +797,11 @@ static struct { atomic_t total_ws; /* Waiters for a free workspace */ wait_queue_head_t ws_wait; -} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; +}; + +static struct workspaces_list btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; + +static struct workspaces_list btrfs_heuristic_ws; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -709,11 +811,25 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { void __init btrfs_init_compress(void) { + struct list_head *workspace; int i; - for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_heuristic_ws.idle_ws); + spin_lock_init(&btrfs_heuristic_ws.ws_lock); + atomic_set(&btrfs_heuristic_ws.total_ws, 0); + init_waitqueue_head(&btrfs_heuristic_ws.ws_wait); + + workspace = alloc_heuristic_ws(); + if (IS_ERR(workspace)) { + pr_warn( + "BTRFS: cannot preallocate heuristic workspace, will try later\n"); + } else { + atomic_set(&btrfs_heuristic_ws.total_ws, 1); + btrfs_heuristic_ws.free_ws = 1; + list_add(workspace, &btrfs_heuristic_ws.idle_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); atomic_set(&btrfs_comp_ws[i].total_ws, 0); @@ -740,18 +856,32 @@ void __init btrfs_init_compress(void) * Preallocation makes a forward progress guarantees and we do not return * errors. */ -static struct list_head *find_workspace(int type) +static struct list_head *__find_workspace(int type, bool heuristic) { struct list_head *workspace; int cpus = num_online_cpus(); int idx = type - 1; unsigned nofs_flag; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { @@ -781,7 +911,10 @@ again: * context of btrfs_compress_bio/btrfs_compress_pages */ nofs_flag = memalloc_nofs_save(); - workspace = btrfs_compress_op[idx]->alloc_workspace(); + if (heuristic) + workspace = alloc_heuristic_ws(); + else + workspace = btrfs_compress_op[idx]->alloc_workspace(); memalloc_nofs_restore(nofs_flag); if (IS_ERR(workspace)) { @@ -812,18 +945,38 @@ again: return workspace; } +static struct list_head *find_workspace(int type) +{ + return __find_workspace(type, false); +} + /* * put a workspace struct back on the list or free it if we have enough * idle ones sitting around */ -static void free_workspace(int type, struct list_head *workspace) +static void __free_workspace(int type, struct list_head *workspace, + bool heuristic) { int idx = type - 1; - struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; - spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; - wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *free_ws = &btrfs_comp_ws[idx].free_ws; + struct list_head *idle_ws; + spinlock_t *ws_lock; + atomic_t *total_ws; + wait_queue_head_t *ws_wait; + int *free_ws; + + if (heuristic) { + idle_ws = &btrfs_heuristic_ws.idle_ws; + ws_lock = &btrfs_heuristic_ws.ws_lock; + total_ws = &btrfs_heuristic_ws.total_ws; + ws_wait = &btrfs_heuristic_ws.ws_wait; + free_ws = &btrfs_heuristic_ws.free_ws; + } else { + idle_ws = &btrfs_comp_ws[idx].idle_ws; + ws_lock = &btrfs_comp_ws[idx].ws_lock; + total_ws = &btrfs_comp_ws[idx].total_ws; + ws_wait = &btrfs_comp_ws[idx].ws_wait; + free_ws = &btrfs_comp_ws[idx].free_ws; + } spin_lock(ws_lock); if (*free_ws <= num_online_cpus()) { @@ -834,7 +987,10 @@ static void free_workspace(int type, struct list_head *workspace) } spin_unlock(ws_lock); - btrfs_compress_op[idx]->free_workspace(workspace); + if (heuristic) + free_heuristic_ws(workspace); + else + btrfs_compress_op[idx]->free_workspace(workspace); atomic_dec(total_ws); wake: /* @@ -845,6 +1001,11 @@ wake: wake_up(ws_wait); } +static void free_workspace(int type, struct list_head *ws) +{ + return __free_workspace(type, ws, false); +} + /* * cleanup function for module exit */ @@ -853,6 +1014,13 @@ static void free_workspaces(void) struct list_head *workspace; int i; + while (!list_empty(&btrfs_heuristic_ws.idle_ws)) { + workspace = btrfs_heuristic_ws.idle_ws.next; + list_del(workspace); + free_heuristic_ws(workspace); + atomic_dec(&btrfs_heuristic_ws.total_ws); + } + for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { workspace = btrfs_comp_ws[i].idle_ws.next; @@ -867,6 +1035,11 @@ static void free_workspaces(void) * Given an address space and start and length, compress the bytes into @pages * that are allocated on demand. * + * @type_level is encoded algorithm and level, where level 0 means whatever + * default the algorithm chooses and is opaque here; + * - compression algo are 0-3 + * - the level are bits 4-7 + * * @out_pages is an in/out parameter, holds maximum number of pages to allocate * and returns number of actually allocated pages * @@ -881,7 +1054,7 @@ static void free_workspaces(void) * @max_out tells us the max number of bytes that we're allowed to * stuff into pages */ -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -889,9 +1062,11 @@ int btrfs_compress_pages(int type, struct address_space *mapping, { struct list_head *workspace; int ret; + int type = type_level & 0xF; workspace = find_workspace(type); + btrfs_compress_op[type - 1]->set_level(workspace, type_level); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, pages, out_pages, @@ -1050,6 +1225,211 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, } /* + * Shannon Entropy calculation + * + * Pure byte distribution analysis fails to determine compressiability of data. + * Try calculating entropy to estimate the average minimum number of bits + * needed to encode the sampled data. + * + * For convenience, return the percentage of needed bits, instead of amount of + * bits directly. + * + * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy + * and can be compressible with high probability + * + * @ENTROPY_LVL_HIGH - data are not compressible with high probability + * + * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. + */ +#define ENTROPY_LVL_ACEPTABLE (65) +#define ENTROPY_LVL_HIGH (80) + +/* + * For increasead precision in shannon_entropy calculation, + * let's do pow(n, M) to save more digits after comma: + * + * - maximum int bit length is 64 + * - ilog2(MAX_SAMPLE_SIZE) -> 13 + * - 13 * 4 = 52 < 64 -> M = 4 + * + * So use pow(n, 4). + */ +static inline u32 ilog2_w(u64 n) +{ + return ilog2(n * n * n * n); +} + +static u32 shannon_entropy(struct heuristic_ws *ws) +{ + const u32 entropy_max = 8 * ilog2_w(2); + u32 entropy_sum = 0; + u32 p, p_base, sz_base; + u32 i; + + sz_base = ilog2_w(ws->sample_size); + for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { + p = ws->bucket[i].count; + p_base = ilog2_w(p); + entropy_sum += p * (sz_base - p_base); + } + + entropy_sum /= ws->sample_size; + return entropy_sum * 100 / entropy_max; +} + +/* Compare buckets by size, ascending */ +static int bucket_comp_rev(const void *lv, const void *rv) +{ + const struct bucket_item *l = (const struct bucket_item *)lv; + const struct bucket_item *r = (const struct bucket_item *)rv; + + return r->count - l->count; +} + +/* + * Size of the core byte set - how many bytes cover 90% of the sample + * + * There are several types of structured binary data that use nearly all byte + * values. The distribution can be uniform and counts in all buckets will be + * nearly the same (eg. encrypted data). Unlikely to be compressible. + * + * Other possibility is normal (Gaussian) distribution, where the data could + * be potentially compressible, but we have to take a few more steps to decide + * how much. + * + * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, + * compression algo can easy fix that + * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high + * probability is not compressible + */ +#define BYTE_CORE_SET_LOW (64) +#define BYTE_CORE_SET_HIGH (200) + +static int byte_core_set_size(struct heuristic_ws *ws) +{ + u32 i; + u32 coreset_sum = 0; + const u32 core_set_threshold = ws->sample_size * 90 / 100; + struct bucket_item *bucket = ws->bucket; + + /* Sort in reverse order */ + sort(bucket, BUCKET_SIZE, sizeof(*bucket), &bucket_comp_rev, NULL); + + for (i = 0; i < BYTE_CORE_SET_LOW; i++) + coreset_sum += bucket[i].count; + + if (coreset_sum > core_set_threshold) + return i; + + for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { + coreset_sum += bucket[i].count; + if (coreset_sum > core_set_threshold) + break; + } + + return i; +} + +/* + * Count byte values in buckets. + * This heuristic can detect textual data (configs, xml, json, html, etc). + * Because in most text-like data byte set is restricted to limited number of + * possible characters, and that restriction in most cases makes data easy to + * compress. + * + * @BYTE_SET_THRESHOLD - consider all data within this byte set size: + * less - compressible + * more - need additional analysis + */ +#define BYTE_SET_THRESHOLD (64) + +static u32 byte_set_size(const struct heuristic_ws *ws) +{ + u32 i; + u32 byte_set_size = 0; + + for (i = 0; i < BYTE_SET_THRESHOLD; i++) { + if (ws->bucket[i].count > 0) + byte_set_size++; + } + + /* + * Continue collecting count of byte values in buckets. If the byte + * set size is bigger then the threshold, it's pointless to continue, + * the detection technique would fail for this type of data. + */ + for (; i < BUCKET_SIZE; i++) { + if (ws->bucket[i].count > 0) { + byte_set_size++; + if (byte_set_size > BYTE_SET_THRESHOLD) + return byte_set_size; + } + } + + return byte_set_size; +} + +static bool sample_repeated_patterns(struct heuristic_ws *ws) +{ + const u32 half_of_sample = ws->sample_size / 2; + const u8 *data = ws->sample; + + return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; +} + +static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, + struct heuristic_ws *ws) +{ + struct page *page; + u64 index, index_end; + u32 i, curr_sample_pos; + u8 *in_data; + + /* + * Compression handles the input data by chunks of 128KiB + * (defined by BTRFS_MAX_UNCOMPRESSED) + * + * We do the same for the heuristic and loop over the whole range. + * + * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will + * process no more than BTRFS_MAX_UNCOMPRESSED at a time. + */ + if (end - start > BTRFS_MAX_UNCOMPRESSED) + end = start + BTRFS_MAX_UNCOMPRESSED; + + index = start >> PAGE_SHIFT; + index_end = end >> PAGE_SHIFT; + + /* Don't miss unaligned end */ + if (!IS_ALIGNED(end, PAGE_SIZE)) + index_end++; + + curr_sample_pos = 0; + while (index < index_end) { + page = find_get_page(inode->i_mapping, index); + in_data = kmap(page); + /* Handle case where the start is not aligned to PAGE_SIZE */ + i = start % PAGE_SIZE; + while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { + /* Don't sample any garbage from the last page */ + if (start > end - SAMPLING_READ_SIZE) + break; + memcpy(&ws->sample[curr_sample_pos], &in_data[i], + SAMPLING_READ_SIZE); + i += SAMPLING_INTERVAL; + start += SAMPLING_INTERVAL; + curr_sample_pos += SAMPLING_READ_SIZE; + } + kunmap(page); + put_page(page); + + index++; + } + + ws->sample_size = curr_sample_pos; +} + +/* * Compression heuristic. * * For now is's a naive and optimistic 'return true', we'll extend the logic to @@ -1066,18 +1446,87 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, */ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) { - u64 index = start >> PAGE_SHIFT; - u64 end_index = end >> PAGE_SHIFT; - struct page *page; - int ret = 1; + struct list_head *ws_list = __find_workspace(0, true); + struct heuristic_ws *ws; + u32 i; + u8 byte; + int ret = 0; - while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - kmap(page); - kunmap(page); - put_page(page); - index++; + ws = list_entry(ws_list, struct heuristic_ws, list); + + heuristic_collect_sample(inode, start, end, ws); + + if (sample_repeated_patterns(ws)) { + ret = 1; + goto out; + } + + memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); + + for (i = 0; i < ws->sample_size; i++) { + byte = ws->sample[i]; + ws->bucket[byte].count++; + } + + i = byte_set_size(ws); + if (i < BYTE_SET_THRESHOLD) { + ret = 2; + goto out; + } + + i = byte_core_set_size(ws); + if (i <= BYTE_CORE_SET_LOW) { + ret = 3; + goto out; + } + + if (i >= BYTE_CORE_SET_HIGH) { + ret = 0; + goto out; + } + + i = shannon_entropy(ws); + if (i <= ENTROPY_LVL_ACEPTABLE) { + ret = 4; + goto out; + } + + /* + * For the levels below ENTROPY_LVL_HIGH, additional analysis would be + * needed to give green light to compression. + * + * For now just assume that compression at that level is not worth the + * resources because: + * + * 1. it is possible to defrag the data later + * + * 2. the data would turn out to be hardly compressible, eg. 150 byte + * values, every bucket has counter at level ~54. The heuristic would + * be confused. This can happen when data have some internal repeated + * patterns like "abbacbbc...". This can be detected by analyzing + * pairs of bytes, which is too costly. + */ + if (i < ENTROPY_LVL_HIGH) { + ret = 5; + goto out; + } else { + ret = 0; + goto out; } +out: + __free_workspace(0, ws_list, true); return ret; } + +unsigned int btrfs_compress_str2level(const char *str) +{ + if (strncmp(str, "zlib", 4) != 0) + return 0; + + /* Accepted form: zlib:1 up to zlib:9 and nothing left after the number */ + if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) + return str[5] - '0'; + + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d2781ff8f994..da20755ebf21 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -76,7 +76,7 @@ struct compressed_bio { void btrfs_init_compress(void); void btrfs_exit_compress(void); -int btrfs_compress_pages(int type, struct address_space *mapping, +int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, @@ -95,6 +95,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); +unsigned btrfs_compress_str2level(const char *str); + enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, @@ -124,6 +126,8 @@ struct btrfs_compress_op { struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); + + void (*set_level)(struct list_head *ws, unsigned int type); }; extern const struct btrfs_compress_op btrfs_zlib_compress; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d49db7d86be..531e0a8645b0 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * tree until you end up with a lock on the root. A locked buffer * is returned, with a reference held. */ -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; @@ -5496,8 +5496,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } else if (left_end_reached) { if (right_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5508,8 +5507,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, continue; } else if (right_end_reached) { if (left_level == 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5523,8 +5521,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, if (left_level == 0 && right_level == 0) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, ctx); @@ -5532,8 +5529,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_left = ADVANCE; } else if (cmp > 0) { - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, ctx); @@ -5550,8 +5546,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_CHANGED; else result = BTRFS_COMPARE_TREE_SAME; - ret = changed_cb(left_root, right_root, - left_path, right_path, + ret = changed_cb(left_path, right_path, &left_key, result, ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5a8933da39a7..f7df5536ab61 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -523,7 +523,7 @@ struct btrfs_caching_control { }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ -#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) +#define CACHING_CTL_WAKE_UP SZ_2M struct btrfs_io_ctl { void *cur, *orig; @@ -709,7 +709,6 @@ struct btrfs_delayed_root; #define BTRFS_FS_OPEN 5 #define BTRFS_FS_QUOTA_ENABLED 6 #define BTRFS_FS_QUOTA_ENABLING 7 -#define BTRFS_FS_QUOTA_DISABLING 8 #define BTRFS_FS_UPDATE_UUID_TREE_GEN 9 #define BTRFS_FS_CREATING_FREE_SPACE_TREE 10 #define BTRFS_FS_BTREE_ERR 11 @@ -723,7 +722,7 @@ struct btrfs_delayed_root; * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) */ -#define BTRFS_FS_EXCL_OP 14 +#define BTRFS_FS_EXCL_OP 16 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -764,8 +763,6 @@ struct btrfs_fs_info { * delayed dir index item */ struct btrfs_block_rsv global_block_rsv; - /* block reservation for delay allocation */ - struct btrfs_block_rsv delalloc_block_rsv; /* block reservation for metadata operations */ struct btrfs_block_rsv trans_block_rsv; /* block reservation for chunk tree */ @@ -791,6 +788,7 @@ struct btrfs_fs_info { */ unsigned long pending_changes; unsigned long compress_type:4; + unsigned int compress_level; int commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -879,9 +877,6 @@ struct btrfs_fs_info { rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; - atomic_t nr_async_submits; - atomic_t async_submit_draining; - atomic_t nr_async_bios; atomic_t async_delalloc_pages; atomic_t open_ioctl_trans; @@ -1101,6 +1096,11 @@ struct btrfs_fs_info { u32 nodesize; u32 sectorsize; u32 stripesize; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -1339,6 +1339,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) +#define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2640,7 +2641,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, @@ -2659,7 +2660,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2671,7 +2672,7 @@ void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); @@ -2745,6 +2746,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, u64 *qgroup_reserved, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, @@ -2752,6 +2755,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode, void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); @@ -2810,6 +2816,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2822,9 +2829,7 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 19e4ad2f3f2e..5d73f79ded8b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata( struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; - bool release = false; src_rsv = trans->block_rsv; dst_rsv = &fs_info->delayed_block_rsv; @@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); /* - * If our block_rsv is the delalloc block reserve then check and see if - * we have our extra reservation for updating the inode. If not fall - * through and try to reserve space quickly. - * - * We used to try and steal from the delalloc block rsv or the global - * reserve, but we'd steal a full reservation, which isn't kind. We are - * here through delalloc which means we've likely just cowed down close - * to the leaf that contains the inode, so we would steal less just - * doing the fallback inode update, so if we do end up having to steal - * from the global block rsv we hopefully only steal one or two blocks - * worth which is less likely to hurt us. - */ - if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&inode->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - release = true; - else - src_rsv = NULL; - spin_unlock(&inode->lock); - } - - /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the * space. * * Now if src_rsv == delalloc_block_rsv we'll let it just steal since - * we're accounted for. + * we always reserve enough to update the inode item. */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata( } ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); - - /* - * Migrate only takes a reservation, it doesn't touch the size of the - * block_rsv. This is to simplify people who don't normally have things - * migrated from their block rsv. If they go to release their - * reservation, that will decrease the size as well, so if migrate - * reduced size we'd end up with a negative size. But for the - * delalloc_meta_reserved stuff we will only know to drop 1 reservation, - * but we could in fact do this reserve/migrate dance several times - * between the time we did the original reservation and we'd clean it - * up. So to take care of this, release the space for the meta - * reservation here. I think it may be time for a documentation page on - * how block rsvs. work. - */ if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } - if (release) { - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); - } - return ret; } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 93ffa898df6d..83be8f9fd906 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -40,10 +40,10 @@ struct kmem_cache *btrfs_delayed_extent_op_cachep; /* * compare two delayed tree backrefs with same bytenr and type */ -static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, - struct btrfs_delayed_tree_ref *ref1, int type) +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1, + struct btrfs_delayed_tree_ref *ref2) { - if (type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { if (ref1->root < ref2->root) return -1; if (ref1->root > ref2->root) @@ -60,8 +60,8 @@ static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, /* * compare two delayed data backrefs with same bytenr and type */ -static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, - struct btrfs_delayed_data_ref *ref1) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref1, + struct btrfs_delayed_data_ref *ref2) { if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { if (ref1->root < ref2->root) @@ -85,6 +85,34 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } +static int comp_refs(struct btrfs_delayed_ref_node *ref1, + struct btrfs_delayed_ref_node *ref2, + bool check_seq) +{ + int ret = 0; + + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1), + btrfs_delayed_node_to_tree_ref(ref2)); + else + ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1), + btrfs_delayed_node_to_data_ref(ref2)); + if (ret) + return ret; + if (check_seq) { + if (ref1->seq < ref2->seq) + return -1; + if (ref1->seq > ref2->seq) + return 1; + } + return 0; +} + /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -96,15 +124,43 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, u64 bytenr; ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node); - bytenr = ins->node.bytenr; + bytenr = ins->bytenr; while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(node, parent_node, p); + rb_insert_color(node, root); + return NULL; +} + +static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root, + struct btrfs_delayed_ref_node *ins) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *node = &ins->ref_node; + struct rb_node *parent_node = NULL; + struct btrfs_delayed_ref_node *entry; + + while (*p) { + int comp; + + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, + ref_node); + comp = comp_refs(ins, entry, true); + if (comp < 0) p = &(*p)->rb_left; - else if (bytenr > entry->node.bytenr) + else if (comp > 0) p = &(*p)->rb_right; else return entry; @@ -133,15 +189,15 @@ find_ref_head(struct rb_root *root, u64 bytenr, while (n) { entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node); - if (bytenr < entry->node.bytenr) + if (bytenr < entry->bytenr) n = n->rb_left; - else if (bytenr > entry->node.bytenr) + else if (bytenr > entry->bytenr) n = n->rb_right; else return entry; } if (entry && return_bigger) { - if (bytenr > entry->node.bytenr) { + if (bytenr > entry->bytenr) { n = rb_next(&entry->href_node); if (!n) n = rb_first(root); @@ -164,17 +220,17 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); spin_lock(&delayed_refs->lock); - if (!head->node.in_tree) { + if (RB_EMPTY_NODE(&head->href_node)) { mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return 0; } @@ -183,15 +239,11 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { - if (btrfs_delayed_ref_is_head(ref)) { - head = btrfs_delayed_node_to_head(ref); - rb_erase(&head->href_node, &delayed_refs->href_root); - } else { - assert_spin_locked(&head->lock); - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); - } + assert_spin_locked(&head->lock); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); ref->in_tree = 0; btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); @@ -206,36 +258,18 @@ static bool merge_ref(struct btrfs_trans_handle *trans, u64 seq) { struct btrfs_delayed_ref_node *next; + struct rb_node *node = rb_next(&ref->ref_node); bool done = false; - next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (!done && &next->list != &head->ref_list) { + while (!done && node) { int mod; - struct btrfs_delayed_ref_node *next2; - - next2 = list_next_entry(next, list); - - if (next == ref) - goto next; + next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); + node = rb_next(node); if (seq && next->seq >= seq) - goto next; - - if (next->type != ref->type) - goto next; - - if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), - btrfs_delayed_node_to_tree_ref(next), - ref->type)) - goto next; - if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || - ref->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(ref), - btrfs_delayed_node_to_data_ref(next))) - goto next; + break; + if (comp_refs(ref, next, false)) + break; if (ref->action == next->action) { mod = next->ref_mod; @@ -259,8 +293,6 @@ static bool merge_ref(struct btrfs_trans_handle *trans, WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ref->type == BTRFS_SHARED_BLOCK_REF_KEY); } -next: - next = next2; } return done; @@ -272,11 +304,12 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; + struct rb_node *node; u64 seq = 0; assert_spin_locked(&head->lock); - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return; /* We don't have too many refs to merge for data. */ @@ -293,22 +326,13 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, } spin_unlock(&fs_info->tree_mod_seq_lock); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); - while (&ref->list != &head->ref_list) { +again: + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) - goto next; - - if (merge_ref(trans, delayed_refs, head, ref, seq)) { - if (list_empty(&head->ref_list)) - break; - ref = list_first_entry(&head->ref_list, - struct btrfs_delayed_ref_node, - list); continue; - } -next: - ref = list_next_entry(ref, list); + if (merge_ref(trans, delayed_refs, head, ref, seq)) + goto again; } } @@ -380,8 +404,8 @@ again: head->processing = 1; WARN_ON(delayed_refs->num_heads_ready == 0); delayed_refs->num_heads_ready--; - delayed_refs->run_delayed_start = head->node.bytenr + - head->node.num_bytes; + delayed_refs->run_delayed_start = head->bytenr + + head->num_bytes; return head; } @@ -391,37 +415,19 @@ again: * Return 0 for insert. * Return >0 for merge. */ -static int -add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *root, - struct btrfs_delayed_ref_head *href, - struct btrfs_delayed_ref_node *ref) +static int insert_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { struct btrfs_delayed_ref_node *exist; int mod; int ret = 0; spin_lock(&href->lock); - /* Check whether we can merge the tail node with ref */ - if (list_empty(&href->ref_list)) - goto add_tail; - exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, - list); - /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->seq != ref->seq) - goto add_tail; - - if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || - exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && - comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), - btrfs_delayed_node_to_tree_ref(ref), - ref->type)) - goto add_tail; - if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || - exist->type == BTRFS_SHARED_DATA_REF_KEY) && - comp_data_refs(btrfs_delayed_node_to_data_ref(exist), - btrfs_delayed_node_to_data_ref(ref))) - goto add_tail; + exist = tree_insert(&href->ref_tree, ref); + if (!exist) + goto inserted; /* Now we are sure we can merge */ ret = 1; @@ -452,9 +458,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, drop_delayed_ref(trans, root, href, exist); spin_unlock(&href->lock); return ret; - -add_tail: - list_add_tail(&ref->list, &href->ref_list); +inserted: if (ref->action == BTRFS_ADD_DELAYED_REF) list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); @@ -469,20 +473,16 @@ add_tail: */ static noinline void update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update, + struct btrfs_delayed_ref_head *existing, + struct btrfs_delayed_ref_head *update, int *old_ref_mod_ret) { - struct btrfs_delayed_ref_head *existing_ref; - struct btrfs_delayed_ref_head *ref; int old_ref_mod; - existing_ref = btrfs_delayed_node_to_head(existing); - ref = btrfs_delayed_node_to_head(update); - BUG_ON(existing_ref->is_data != ref->is_data); + BUG_ON(existing->is_data != update->is_data); - spin_lock(&existing_ref->lock); - if (ref->must_insert_reserved) { + spin_lock(&existing->lock); + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref * entries were processed, we can end up @@ -490,7 +490,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * the must_insert_reserved flag set. * Set it again here */ - existing_ref->must_insert_reserved = ref->must_insert_reserved; + existing->must_insert_reserved = update->must_insert_reserved; /* * update the num_bytes so we make sure the accounting @@ -500,22 +500,22 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, } - if (ref->extent_op) { - if (!existing_ref->extent_op) { - existing_ref->extent_op = ref->extent_op; + if (update->extent_op) { + if (!existing->extent_op) { + existing->extent_op = update->extent_op; } else { - if (ref->extent_op->update_key) { - memcpy(&existing_ref->extent_op->key, - &ref->extent_op->key, - sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = true; + if (update->extent_op->update_key) { + memcpy(&existing->extent_op->key, + &update->extent_op->key, + sizeof(update->extent_op->key)); + existing->extent_op->update_key = true; } - if (ref->extent_op->update_flags) { - existing_ref->extent_op->flags_to_set |= - ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = true; + if (update->extent_op->update_flags) { + existing->extent_op->flags_to_set |= + update->extent_op->flags_to_set; + existing->extent_op->update_flags = true; } - btrfs_free_delayed_extent_op(ref->extent_op); + btrfs_free_delayed_extent_op(update->extent_op); } } /* @@ -523,23 +523,23 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ - old_ref_mod = existing_ref->total_ref_mod; + old_ref_mod = existing->total_ref_mod; if (old_ref_mod_ret) *old_ref_mod_ret = old_ref_mod; existing->ref_mod += update->ref_mod; - existing_ref->total_ref_mod += update->ref_mod; + existing->total_ref_mod += update->ref_mod; /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. */ - if (existing_ref->is_data) { - if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + if (existing->is_data) { + if (existing->total_ref_mod >= 0 && old_ref_mod < 0) delayed_refs->pending_csums -= existing->num_bytes; - if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + if (existing->total_ref_mod < 0 && old_ref_mod >= 0) delayed_refs->pending_csums += existing->num_bytes; } - spin_unlock(&existing_ref->lock); + spin_unlock(&existing->lock); } /* @@ -550,14 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, + struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, int is_data, int *qrecord_inserted_ret, int *old_ref_mod, int *new_ref_mod) { struct btrfs_delayed_ref_head *existing; - struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; int must_insert_reserved = 0; @@ -593,26 +592,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; - /* first set the basic ref node struct up */ - refcount_set(&ref->refs, 1); - ref->bytenr = bytenr; - ref->num_bytes = num_bytes; - ref->ref_mod = count_mod; - ref->type = 0; - ref->action = 0; - ref->is_head = 1; - ref->in_tree = 1; - ref->seq = 0; - - head_ref = btrfs_delayed_node_to_head(ref); + refcount_set(&head_ref->refs, 1); + head_ref->bytenr = bytenr; + head_ref->num_bytes = num_bytes; + head_ref->ref_mod = count_mod; head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - INIT_LIST_HEAD(&head_ref->ref_list); + head_ref->ref_tree = RB_ROOT; INIT_LIST_HEAD(&head_ref->ref_add_list); + RB_CLEAR_NODE(&head_ref->href_node); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; head_ref->qgroup_ref_root = 0; + spin_lock_init(&head_ref->lock); + mutex_init(&head_ref->mutex); /* Record qgroup extent info if provided */ if (qrecord) { @@ -632,17 +626,14 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord_inserted = 1; } - spin_lock_init(&head_ref->lock); - mutex_init(&head_ref->mutex); - - trace_add_delayed_ref_head(fs_info, ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { WARN_ON(ref_root && reserved && existing->qgroup_ref_root && existing->qgroup_reserved); - update_existing_head_ref(delayed_refs, &existing->node, ref, + update_existing_head_ref(delayed_refs, existing, head_ref, old_ref_mod); /* * we've updated the existing ref, free the newly @@ -699,7 +690,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -713,7 +704,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); /* * XXX: memory should be freed at the same level allocated. @@ -756,7 +747,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; - INIT_LIST_HEAD(&ref->list); + RB_CLEAR_NODE(&ref->ref_node); INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -772,8 +763,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(fs_info, ref, full_ref, action); - ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); - + ret = insert_delayed_ref(trans, delayed_refs, head_ref, ref); if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); } @@ -821,7 +811,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, 0, 0, action, 0, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -888,7 +878,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, + head_ref = add_delayed_ref_head(fs_info, trans, head_ref, record, bytenr, num_bytes, ref_root, reserved, action, 1, &qrecord_inserted, old_ref_mod, new_ref_mod); @@ -920,7 +910,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + add_delayed_ref_head(fs_info, trans, head_ref, NULL, bytenr, num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, NULL, NULL, NULL); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index ce88e4ac5276..a43af432f859 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -26,18 +26,8 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ -/* - * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the - * same ref_node structure. - * Ref_head is in a higher logic level than tree/data ref, and duplicated - * bytenr/num_bytes in ref_node is really a waste or memory, they should be - * referred from ref_head. - * This gets more disgusting after we use list to store tree/data ref in - * ref_head. Must clean this mess up later. - */ struct btrfs_delayed_ref_node { - /*data/tree ref use list, stored in ref_head->ref_list. */ - struct list_head list; + struct rb_node ref_node; /* * If action is BTRFS_ADD_DELAYED_REF, also link this node to * ref_head->ref_add_list, then we do not need to iterate the @@ -91,8 +81,9 @@ struct btrfs_delayed_extent_op { * reference count modifications we've queued up. */ struct btrfs_delayed_ref_head { - struct btrfs_delayed_ref_node node; - + u64 bytenr; + u64 num_bytes; + refcount_t refs; /* * the mutex is held while running the refs, and it is also * held when checking the sum of reference modifications. @@ -100,7 +91,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct list_head ref_list; + struct rb_root ref_tree; /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */ struct list_head ref_add_list; @@ -116,6 +107,14 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * This is the current outstanding mod references for this bytenr. This + * is used with lookup_extent_info to get an accurate reference count + * for a bytenr, so it is adjusted as delayed refs are run so that any + * on disk reference count + ref_mod is accurate. + */ + int ref_mod; + + /* * For qgroup reserved space freeing. * * ref_root and reserved will be recorded after @@ -234,15 +233,18 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) case BTRFS_SHARED_DATA_REF_KEY: kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); break; - case 0: - kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); - break; default: BUG(); } } } +static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head) +{ + if (refcount_dec_and_test(&head->refs)) + kmem_cache_free(btrfs_delayed_ref_head_cachep, head); +} + int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, @@ -283,35 +285,17 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); /* - * a node might live in a head or a regular ref, this lets you - * test for the proper type to use. - */ -static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) -{ - return node->is_head; -} - -/* * helper functions to cast a node into its container */ static inline struct btrfs_delayed_tree_ref * btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_tree_ref, node); } static inline struct btrfs_delayed_data_ref * btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) { - WARN_ON(btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_data_ref, node); } - -static inline struct btrfs_delayed_ref_head * -btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) -{ - WARN_ON(!btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref_head, node); -} #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 487bbe4fb3c6..efce9a2fa9be 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,8 @@ #include "sysfs.h" #include "qgroup.h" #include "compression.h" +#include "tree-checker.h" +#include "ref-verify.h" #ifdef CONFIG_X86 #include <asm/cpufeature.h> @@ -543,146 +545,6 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, \ - "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node", \ - reason, btrfs_header_bytenr(eb), root->objectid, slot) - -static noinline int check_leaf(struct btrfs_root *root, - struct extent_buffer *leaf) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_key key; - struct btrfs_key leaf_key; - u32 nritems = btrfs_header_nritems(leaf); - int slot; - - /* - * Extent buffers from a relocation tree have a owner field that - * corresponds to the subvolume tree they are based on. So just from an - * extent buffer alone we can not find out what is the id of the - * corresponding subvolume tree, so we can not figure out if the extent - * buffer corresponds to the root of the relocation tree or not. So skip - * this check for relocation trees. - */ - if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { - struct btrfs_root *check_root; - - key.objectid = btrfs_header_owner(leaf); - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - check_root = btrfs_get_fs_root(fs_info, &key, false); - /* - * The only reason we also check NULL here is that during - * open_ctree() some roots has not yet been set up. - */ - if (!IS_ERR_OR_NULL(check_root)) { - struct extent_buffer *eb; - - eb = btrfs_root_node(check_root); - /* if leaf is the root, then it's fine */ - if (leaf != eb) { - CORRUPT("non-root leaf's nritems is 0", - leaf, check_root, 0); - free_extent_buffer(eb); - return -EIO; - } - free_extent_buffer(eb); - } - return 0; - } - - if (nritems == 0) - return 0; - - /* Check the 0 item */ - if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("invalid item offset size pair", leaf, root, 0); - return -EIO; - } - - /* - * Check to make sure each items keys are in the correct order and their - * offsets make sense. We only have to loop through nritems-1 because - * we check the current slot against the next slot, which verifies the - * next slot's offset+size makes sense and that the current's slot - * offset is correct. - */ - for (slot = 0; slot < nritems - 1; slot++) { - btrfs_item_key_to_cpu(leaf, &leaf_key, slot); - btrfs_item_key_to_cpu(leaf, &key, slot + 1); - - /* Make sure the keys are in the right order */ - if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { - CORRUPT("bad key order", leaf, root, slot); - return -EIO; - } - - /* - * Make sure the offset and ends are right, remember that the - * item data starts at the end of the leaf and grows towards the - * front. - */ - if (btrfs_item_offset_nr(leaf, slot) != - btrfs_item_end_nr(leaf, slot + 1)) { - CORRUPT("slot offset bad", leaf, root, slot); - return -EIO; - } - - /* - * Check to make sure that we don't point outside of the leaf, - * just in case all the items are consistent to each other, but - * all point outside of the leaf. - */ - if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(fs_info)) { - CORRUPT("slot end outside of leaf", leaf, root, slot); - return -EIO; - } - } - - return 0; -} - -static int check_node(struct btrfs_root *root, struct extent_buffer *node) -{ - unsigned long nr = btrfs_header_nritems(node); - struct btrfs_key key, next_key; - int slot; - u64 bytenr; - int ret = 0; - - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { - btrfs_crit(root->fs_info, - "corrupt node: block %llu root %llu nritems %lu", - node->start, root->objectid, nr); - return -EIO; - } - - for (slot = 0; slot < nr - 1; slot++) { - bytenr = btrfs_node_blockptr(node, slot); - btrfs_node_key_to_cpu(node, &key, slot); - btrfs_node_key_to_cpu(node, &next_key, slot + 1); - - if (!bytenr) { - CORRUPT("invalid item slot", node, root, slot); - ret = -EIO; - goto out; - } - - if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { - CORRUPT("bad key order", node, root, slot); - ret = -EIO; - goto out; - } - } -out: - return ret; -} - static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -748,12 +610,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } - if (found_level > 0 && check_node(root, eb)) + if (found_level > 0 && btrfs_check_node(root, eb)) ret = -EIO; if (!ret) @@ -879,22 +741,9 @@ static void run_one_async_start(struct btrfs_work *work) static void run_one_async_done(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; - int limit; async = container_of(work, struct async_submit_bio, work); - fs_info = async->fs_info; - - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; - - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_submits) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); /* If an error occurred we just want to clean up the bio and move on */ if (async->status) { @@ -942,19 +791,10 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, async->status = 0; - atomic_inc(&fs_info->nr_async_submits); - if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); - - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->nr_async_submits)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0)); - } - return 0; } @@ -1005,9 +845,9 @@ static blk_status_t __btree_submit_bio_done(void *private_data, struct bio *bio, return ret; } -static int check_async_write(unsigned long bio_flags) +static int check_async_write(struct btrfs_inode *bi) { - if (bio_flags & EXTENT_BIO_TREE_LOG) + if (atomic_read(&bi->sync_writers)) return 0; #ifdef CONFIG_X86 if (static_cpu_has(X86_FEATURE_XMM4_2)) @@ -1022,7 +862,7 @@ static blk_status_t btree_submit_bio_hook(void *private_data, struct bio *bio, { struct inode *inode = private_data; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int async = check_async_write(bio_flags); + int async = check_async_write(BTRFS_I(inode)); blk_status_t ret; if (bio_op(bio) != REQ_OP_WRITE) { @@ -2607,14 +2447,6 @@ int open_ctree(struct super_block *sb, goto fail_delalloc_bytes; } - fs_info->btree_inode = new_inode(sb); - if (!fs_info->btree_inode) { - err = -ENOMEM; - goto fail_bio_counter; - } - - mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); @@ -2647,17 +2479,12 @@ int open_ctree(struct super_block *sb, btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); - btrfs_init_block_rsv(&fs_info->delalloc_block_rsv, - BTRFS_BLOCK_RSV_DELALLOC); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); - atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); - atomic_set(&fs_info->async_submit_draining, 0); - atomic_set(&fs_info->nr_async_bios, 0); atomic_set(&fs_info->defrag_running, 0); atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); @@ -2673,12 +2500,21 @@ int open_ctree(struct super_block *sb, /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); spin_lock_init(&fs_info->reada_lock); + btrfs_init_ref_verify(fs_info); fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_roots); spin_lock_init(&fs_info->ordered_root_lock); + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bio_counter; + } + mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), GFP_KERNEL); if (!fs_info->delayed_root) { @@ -2895,12 +2731,13 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); + memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE); mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(fs_info); @@ -3083,6 +2920,9 @@ retry_root_backup: if (ret) goto fail_trans_kthread; + if (btrfs_build_ref_tree(fs_info)) + btrfs_err(fs_info, "couldn't build ref tree"); + /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && !btrfs_test_opt(fs_info, NOLOGREPLAY)) { @@ -3643,7 +3483,14 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); - backup_super_roots(fs_info); + + /* + * max_mirrors == 0 indicates we're from commit_transaction, + * not from fsync where the tree roots in fs_info have not + * been consistent on disk. + */ + if (max_mirrors == 0) + backup_super_roots(fs_info); sb = fs_info->super_for_commit; dev_item = &sb->dev_item; @@ -3941,6 +3788,7 @@ void close_ctree(struct btrfs_fs_info *fs_info) cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); + btrfs_free_ref_cache(fs_info); __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -4000,7 +3848,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { + if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } @@ -4265,26 +4113,28 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_node *tmp; + struct rb_node *n; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); spin_lock(&delayed_refs->lock); continue; } spin_lock(&head->lock); - list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, - list) { + while ((n = rb_first(&head->ref_tree)) != NULL) { + ref = rb_entry(n, struct btrfs_delayed_ref_node, + ref_node); ref->in_tree = 0; - list_del(&ref->list); + rb_erase(&ref->ref_node, &head->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); if (!list_empty(&ref->add_list)) list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); @@ -4297,16 +4147,16 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, if (head->processing == 0) delayed_refs->num_heads_ready--; atomic_dec(&delayed_refs->num_entries); - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); mutex_unlock(&head->mutex); if (pin_bytes) - btrfs_pin_extent(fs_info, head->node.bytenr, - head->node.num_bytes, 1); - btrfs_put_delayed_ref(&head->node); + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + btrfs_put_delayed_ref_head(head); cond_resched(); spin_lock(&delayed_refs->lock); } diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fa66980726c9..3aeb5770f896 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/types.h> #include "ctree.h" diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 074348a95841..91b3908e7c54 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_EXPORT_H #define BTRFS_EXPORT_H diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e2d7e86b51d1..673ac4e01dd0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -26,6 +26,7 @@ #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/percpu_counter.h> +#include <linux/lockdep.h> #include "hash.h" #include "tree-log.h" #include "disk-io.h" @@ -38,6 +39,7 @@ #include "math.h" #include "sysfs.h" #include "qgroup.h" +#include "ref-verify.h" #undef SCRAMBLE_DELAYED_REFS @@ -61,9 +63,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, @@ -91,17 +90,8 @@ static int find_next_key(struct btrfs_path *path, int level, static void dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 ram_bytes, u64 num_bytes, int delalloc); -static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk); static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes); @@ -652,7 +642,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_FAST; spin_unlock(&cache->lock); - if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + if (btrfs_test_opt(fs_info, SPACE_CACHE)) { mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); @@ -923,7 +913,7 @@ search_again: head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -934,7 +924,7 @@ search_again: */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); goto search_again; } spin_lock(&head->lock); @@ -943,7 +933,7 @@ search_again: else BUG_ON(num_refs == 0); - num_refs += head->node.ref_mod; + num_refs += head->ref_mod; spin_unlock(&head->lock); mutex_unlock(&head->mutex); } @@ -2189,16 +2179,20 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid, + owner, offset, BTRFS_ADD_DELAYED_REF); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, @@ -2344,7 +2338,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_ref_head *head, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; @@ -2366,14 +2360,14 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - key.objectid = node->bytenr; + key.objectid = head->bytenr; if (metadata) { key.type = BTRFS_METADATA_ITEM_KEY; key.offset = extent_op->level; } else { key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = node->num_bytes; + key.offset = head->num_bytes; } again: @@ -2390,17 +2384,17 @@ again: path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == node->bytenr && + if (key.objectid == head->bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == node->num_bytes) + key.offset == head->num_bytes) ret = 0; } if (ret > 0) { btrfs_release_path(path); metadata = 0; - key.objectid = node->bytenr; - key.offset = node->num_bytes; + key.objectid = head->bytenr; + key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; goto again; } @@ -2507,44 +2501,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return 0; } - if (btrfs_delayed_ref_is_head(node)) { - struct btrfs_delayed_ref_head *head; - /* - * we've hit the end of the chain and we were supposed - * to insert this extent into the tree. But, it got - * deleted before we ever needed to insert it, so all - * we have to do is clean up the accounting - */ - BUG_ON(extent_op); - head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(fs_info, node, head, node->action); - - if (head->total_ref_mod < 0) { - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, node->bytenr); - ASSERT(cache); - percpu_counter_add(&cache->space_info->total_bytes_pinned, - -node->num_bytes); - btrfs_put_block_group(cache); - } - - if (insert_reserved) { - btrfs_pin_extent(fs_info, node->bytenr, - node->num_bytes, 1); - if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, - node->bytenr, - node->num_bytes); - } - } - - /* Also free its reserved qgroup space */ - btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, - head->qgroup_reserved); - return ret; - } - if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) ret = run_delayed_tree_ref(trans, fs_info, node, extent_op, @@ -2563,7 +2519,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct btrfs_delayed_ref_node *ref; - if (list_empty(&head->ref_list)) + if (RB_EMPTY_ROOT(&head->ref_tree)) return NULL; /* @@ -2576,12 +2532,114 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head) return list_first_entry(&head->ref_add_list, struct btrfs_delayed_ref_node, add_list); - ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, - list); + ref = rb_entry(rb_first(&head->ref_tree), + struct btrfs_delayed_ref_node, ref_node); ASSERT(list_empty(&ref->add_list)); return ref; } +static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + spin_lock(&delayed_refs->lock); + head->processing = 0; + delayed_refs->num_heads_ready++; + spin_unlock(&delayed_refs->lock); + btrfs_delayed_ref_unlock(head); +} + +static int cleanup_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_extent_op *extent_op = head->extent_op; + int ret; + + if (!extent_op) + return 0; + head->extent_op = NULL; + if (head->must_insert_reserved) { + btrfs_free_delayed_extent_op(extent_op); + return 0; + } + spin_unlock(&head->lock); + ret = run_delayed_extent_op(trans, fs_info, head, extent_op); + btrfs_free_delayed_extent_op(extent_op); + return ret ? ret : 1; +} + +static int cleanup_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + delayed_refs = &trans->transaction->delayed_refs; + + ret = cleanup_extent_op(trans, fs_info, head); + if (ret < 0) { + unselect_delayed_ref_head(delayed_refs, head); + btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); + return ret; + } else if (ret) { + return ret; + } + + /* + * Need to drop our head ref lock and re-acquire the delayed ref lock + * and then re-check to make sure nobody got added. + */ + spin_unlock(&head->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&head->lock); + if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) { + spin_unlock(&head->lock); + spin_unlock(&delayed_refs->lock); + return 1; + } + delayed_refs->num_heads--; + rb_erase(&head->href_node, &delayed_refs->href_root); + RB_CLEAR_NODE(&head->href_node); + spin_unlock(&delayed_refs->lock); + spin_unlock(&head->lock); + atomic_dec(&delayed_refs->num_entries); + + trace_run_delayed_ref_head(fs_info, head, 0); + + if (head->total_ref_mod < 0) { + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(fs_info, head->bytenr); + ASSERT(cache); + percpu_counter_add(&cache->space_info->total_bytes_pinned, + -head->num_bytes); + btrfs_put_block_group(cache); + + if (head->is_data) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= head->num_bytes; + spin_unlock(&delayed_refs->lock); + } + } + + if (head->must_insert_reserved) { + btrfs_pin_extent(fs_info, head->bytenr, + head->num_bytes, 1); + if (head->is_data) { + ret = btrfs_del_csums(trans, fs_info, head->bytenr, + head->num_bytes); + } + } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root, + head->qgroup_reserved); + btrfs_delayed_ref_unlock(head); + btrfs_put_delayed_ref_head(head); + return 0; +} + /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. @@ -2655,11 +2713,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); locked_ref = NULL; cond_resched(); count++; @@ -2667,102 +2721,55 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * record the must insert reserved flag before we - * drop the spin lock. + * We're done processing refs in this ref_head, clean everything + * up and move on to the next ref_head. */ - must_insert_reserved = locked_ref->must_insert_reserved; - locked_ref->must_insert_reserved = 0; - - extent_op = locked_ref->extent_op; - locked_ref->extent_op = NULL; - if (!ref) { - - - /* All delayed refs have been processed, Go ahead - * and send the head node to run_one_delayed_ref, - * so that any accounting fixes can happen - */ - ref = &locked_ref->node; - - if (extent_op && must_insert_reserved) { - btrfs_free_delayed_extent_op(extent_op); - extent_op = NULL; - } - - if (extent_op) { - spin_unlock(&locked_ref->lock); - ret = run_delayed_extent_op(trans, fs_info, - ref, extent_op); - btrfs_free_delayed_extent_op(extent_op); - - if (ret) { - /* - * Need to reset must_insert_reserved if - * there was an error so the abort stuff - * can cleanup the reserved space - * properly. - */ - if (must_insert_reserved) - locked_ref->must_insert_reserved = 1; - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_debug(fs_info, - "run_delayed_extent_op returned %d", - ret); - btrfs_delayed_ref_unlock(locked_ref); - return ret; - } + ret = cleanup_ref_head(trans, fs_info, locked_ref); + if (ret > 0 ) { + /* We dropped our lock, we need to loop. */ + ret = 0; continue; + } else if (ret) { + return ret; } + locked_ref = NULL; + count++; + continue; + } - /* - * Need to drop our head ref lock and re-acquire the - * delayed ref lock and then re-check to make sure - * nobody got added. - */ - spin_unlock(&locked_ref->lock); - spin_lock(&delayed_refs->lock); - spin_lock(&locked_ref->lock); - if (!list_empty(&locked_ref->ref_list) || - locked_ref->extent_op) { - spin_unlock(&locked_ref->lock); - spin_unlock(&delayed_refs->lock); - continue; - } - ref->in_tree = 0; - delayed_refs->num_heads--; - rb_erase(&locked_ref->href_node, - &delayed_refs->href_root); - spin_unlock(&delayed_refs->lock); - } else { - actual_count++; - ref->in_tree = 0; - list_del(&ref->list); - if (!list_empty(&ref->add_list)) - list_del(&ref->add_list); + actual_count++; + ref->in_tree = 0; + rb_erase(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); + /* + * When we play the delayed ref, also correct the ref_mod on + * head + */ + switch (ref->action) { + case BTRFS_ADD_DELAYED_REF: + case BTRFS_ADD_DELAYED_EXTENT: + locked_ref->ref_mod -= ref->ref_mod; + break; + case BTRFS_DROP_DELAYED_REF: + locked_ref->ref_mod += ref->ref_mod; + break; + default: + WARN_ON(1); } atomic_dec(&delayed_refs->num_entries); - if (!btrfs_delayed_ref_is_head(ref)) { - /* - * when we play the delayed ref, also correct the - * ref_mod on head - */ - switch (ref->action) { - case BTRFS_ADD_DELAYED_REF: - case BTRFS_ADD_DELAYED_EXTENT: - locked_ref->node.ref_mod -= ref->ref_mod; - break; - case BTRFS_DROP_DELAYED_REF: - locked_ref->node.ref_mod += ref->ref_mod; - break; - default: - WARN_ON(1); - } - } + /* + * Record the must-insert_reserved flag before we drop the spin + * lock. + */ + must_insert_reserved = locked_ref->must_insert_reserved; + locked_ref->must_insert_reserved = 0; + + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, fs_info, ref, extent_op, @@ -2770,33 +2777,13 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_free_delayed_extent_op(extent_op); if (ret) { - spin_lock(&delayed_refs->lock); - locked_ref->processing = 0; - delayed_refs->num_heads_ready++; - spin_unlock(&delayed_refs->lock); - btrfs_delayed_ref_unlock(locked_ref); + unselect_delayed_ref_head(delayed_refs, locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); return ret; } - /* - * If this node is a head, that means all the refs in this head - * have been dealt with, and we will pick the next head to deal - * with, so we must unlock the head and drop it from the cluster - * list before we release it. - */ - if (btrfs_delayed_ref_is_head(ref)) { - if (locked_ref->is_data && - locked_ref->total_ref_mod < 0) { - spin_lock(&delayed_refs->lock); - delayed_refs->pending_csums -= ref->num_bytes; - spin_unlock(&delayed_refs->lock); - } - btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; - } btrfs_put_delayed_ref(ref); count++; cond_resched(); @@ -3100,33 +3087,16 @@ again: spin_unlock(&delayed_refs->lock); goto out; } + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + refcount_inc(&head->refs); + spin_unlock(&delayed_refs->lock); - while (node) { - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - if (btrfs_delayed_ref_is_head(&head->node)) { - struct btrfs_delayed_ref_node *ref; - - ref = &head->node; - refcount_inc(&ref->refs); - - spin_unlock(&delayed_refs->lock); - /* - * Mutex was contended, block until it's - * released and try again - */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); + /* Mutex was contended, block until it's released and retry. */ + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - cond_resched(); - goto again; - } else { - WARN_ON(1); - } - node = rb_next(node); - } - spin_unlock(&delayed_refs->lock); + btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } @@ -3169,6 +3139,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_transaction *cur_trans; + struct rb_node *node; int ret = 0; cur_trans = root->fs_info->running_transaction; @@ -3184,7 +3155,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - refcount_inc(&head->node.refs); + refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3195,13 +3166,18 @@ static noinline int check_delayed_ref(struct btrfs_root *root, */ mutex_lock(&head->mutex); mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return -EAGAIN; } spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - list_for_each_entry(ref, &head->ref_list, list) { + /* + * XXX: We should replace this with a proper search function in the + * future. + */ + for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3351,7 +3327,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, - struct btrfs_fs_info *, + struct btrfs_root *, u64, u64, u64, u64, u64, u64); @@ -3391,7 +3367,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, key.offset); if (ret) @@ -3399,7 +3375,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; - ret = process_func(trans, fs_info, bytenr, num_bytes, + ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0); if (ret) goto fail; @@ -4843,7 +4819,6 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, u64 orig, bool wait_ordered) { - struct btrfs_block_rsv *block_rsv; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; @@ -4859,8 +4834,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, to_reclaim = items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; - block_rsv = &fs_info->delalloc_block_rsv; - space_info = block_rsv->space_info; + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -4919,6 +4893,13 @@ skip_async: } } +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; +}; + /** * maybe_commit_transaction - possibly commit the transaction if its ok to * @root - the root we're allocating for @@ -4930,18 +4911,29 @@ skip_async: * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes, int force) + struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_trans_handle *trans; + u64 bytes; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - if (force) - goto commit; + spin_lock(&space_info->lock); + if (!list_empty(&space_info->priority_tickets)) + ticket = list_first_entry(&space_info->priority_tickets, + struct reserve_ticket, list); + else if (!list_empty(&space_info->tickets)) + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + bytes = (ticket) ? ticket->bytes : 0; + spin_unlock(&space_info->lock); + + if (!bytes) + return 0; /* See if there is enough pinned space to make this reservation */ if (percpu_counter_compare(&space_info->total_bytes_pinned, @@ -4956,8 +4948,12 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, return -ENOSPC; spin_lock(&delayed_rsv->lock); + if (delayed_rsv->size > bytes) + bytes = 0; + else + bytes -= delayed_rsv->size; if (percpu_counter_compare(&space_info->total_bytes_pinned, - bytes - delayed_rsv->size) < 0) { + bytes) < 0) { spin_unlock(&delayed_rsv->lock); return -ENOSPC; } @@ -4971,13 +4967,6 @@ commit: return btrfs_commit_transaction(trans); } -struct reserve_ticket { - u64 bytes; - int error; - struct list_head list; - wait_queue_head_t wait; -}; - /* * Try to flush some data based on policy set by @state. This is only advisory * and may fail for various reasons. The caller is supposed to examine the @@ -5027,8 +5016,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = 0; break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, - num_bytes, 0); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; @@ -5582,11 +5570,12 @@ again: } } -static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, +static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) { struct btrfs_space_info *space_info = block_rsv->space_info; + u64 ret; spin_lock(&block_rsv->lock); if (num_bytes == (u64)-1) @@ -5601,6 +5590,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&block_rsv->lock); + ret = num_bytes; if (num_bytes > 0) { if (dest) { spin_lock(&dest->lock); @@ -5620,6 +5610,7 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, space_info_add_old_bytes(fs_info, space_info, num_bytes); } + return ret; } int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, @@ -5643,6 +5634,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) rsv->type = type; } +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv, + unsigned short type) +{ + btrfs_init_block_rsv(rsv, type); + rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); +} + struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type) { @@ -5652,9 +5652,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, if (!block_rsv) return NULL; - btrfs_init_block_rsv(block_rsv, type); - block_rsv->space_info = __find_space_info(fs_info, - BTRFS_BLOCK_GROUP_METADATA); + btrfs_init_metadata_block_rsv(fs_info, block_rsv, type); return block_rsv; } @@ -5737,6 +5735,66 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +/** + * btrfs_inode_rsv_refill - refill the inode block rsv. + * @inode - the inode we are refilling. + * @flush - the flusing restriction. + * + * Essentially the same as btrfs_block_rsv_refill, except it uses the + * block_rsv->size as the minimum size. We'll either refill the missing amount + * or return if we already have enough space. This will also handle the resreve + * tracepoint for the reserved amount. + */ +int btrfs_inode_rsv_refill(struct btrfs_inode *inode, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_root *root = inode->root; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 num_bytes = 0; + int ret = -ENOSPC; + + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) + num_bytes = block_rsv->size - block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (num_bytes == 0) + return 0; + + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_ino(inode), num_bytes, 1); + } + return ret; +} + +/** + * btrfs_inode_rsv_release - release any excessive reservation. + * @inode - the inode we need to release from. + * + * This is the same as btrfs_block_rsv_release, except that it handles the + * tracepoint for the reservation. + */ +void btrfs_inode_rsv_release(struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 released = 0; + + /* + * Since we statically set the block_rsv->size we just want to say we + * are releasing 0 bytes, and then we'll just get the reservation over + * the size free'd. + */ + released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delalloc", + btrfs_ino(inode), released, 0); +} + void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5808,7 +5866,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info) space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); fs_info->global_block_rsv.space_info = space_info; - fs_info->delalloc_block_rsv.space_info = space_info; fs_info->trans_block_rsv.space_info = space_info; fs_info->empty_block_rsv.space_info = space_info; fs_info->delayed_block_rsv.space_info = space_info; @@ -5828,8 +5885,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) { block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL, (u64)-1); - WARN_ON(fs_info->delalloc_block_rsv.size > 0); - WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); WARN_ON(fs_info->trans_block_rsv.size > 0); WARN_ON(fs_info->trans_block_rsv.reserved > 0); WARN_ON(fs_info->chunk_block_rsv.size > 0); @@ -5841,12 +5896,15 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info) void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - if (!trans->block_rsv) + if (!trans->block_rsv) { + ASSERT(!trans->bytes_reserved); return; + } if (!trans->bytes_reserved) return; + ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, @@ -5968,104 +6026,37 @@ void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, btrfs_block_rsv_release(fs_info, rsv, (u64)-1); } -/** - * drop_outstanding_extent - drop an outstanding extent - * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're releasing. - * - * This is called when we are freeing up an outstanding extent, either called - * after an error or after an extent is written. This will return the number of - * reserved extents that need to be freed. This must be called with - * BTRFS_I(inode)->lock held. - */ -static unsigned drop_outstanding_extent(struct btrfs_inode *inode, - u64 num_bytes) -{ - unsigned drop_inode_space = 0; - unsigned dropped_extents = 0; - unsigned num_extents; - - num_extents = count_max_extents(num_bytes); - ASSERT(num_extents); - ASSERT(inode->outstanding_extents >= num_extents); - inode->outstanding_extents -= num_extents; - - if (inode->outstanding_extents == 0 && - test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) - drop_inode_space = 1; - - /* - * If we have more or the same amount of outstanding extents than we have - * reserved then we need to leave the reserved extents count alone. - */ - if (inode->outstanding_extents >= inode->reserved_extents) - return drop_inode_space; - - dropped_extents = inode->reserved_extents - inode->outstanding_extents; - inode->reserved_extents -= dropped_extents; - return dropped_extents + drop_inode_space; -} - -/** - * calc_csum_metadata_size - return the amount of metadata space that must be - * reserved/freed for the given bytes. - * @inode: the inode we're manipulating - * @num_bytes: the number of bytes in question - * @reserve: 1 if we are reserving space, 0 if we are freeing space - * - * This adjusts the number of csum_bytes in the inode and then returns the - * correct amount of metadata that must either be reserved or freed. We - * calculate how many checksums we can fit into one leaf and then divide the - * number of bytes that will need to be checksumed by this value to figure out - * how many checksums will be required. If we are adding bytes then the number - * may go up and we will return the number of additional bytes that must be - * reserved. If it is going down we will return the number of bytes that must - * be freed. - * - * This must be called with BTRFS_I(inode)->lock held. - */ -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes, - int reserve) +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 old_csums, num_csums; - - if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0) - return 0; - - old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - if (reserve) - inode->csum_bytes += num_bytes; - else - inode->csum_bytes -= num_bytes; - num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); - - /* No change, no need to reserve more */ - if (old_csums == num_csums) - return 0; + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 csum_leaves; + unsigned outstanding_extents; - if (reserve) - return btrfs_calc_trans_metadata_size(fs_info, - num_csums - old_csums); + lockdep_assert_held(&inode->lock); + outstanding_extents = inode->outstanding_extents; + if (outstanding_extents) + reserve_size = btrfs_calc_trans_metadata_size(fs_info, + outstanding_extents + 1); + csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, + inode->csum_bytes); + reserve_size += btrfs_calc_trans_metadata_size(fs_info, + csum_leaves); - return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums); + spin_lock(&block_rsv->lock); + block_rsv->size = reserve_size; + spin_unlock(&block_rsv->lock); } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); struct btrfs_root *root = inode->root; - struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; - u64 to_reserve = 0; - u64 csum_bytes; unsigned nr_extents; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; - u64 to_free = 0; - unsigned dropped; - bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc @@ -6091,19 +6082,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + /* Add our new extents and calculate the new rsv size. */ spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); - inode->outstanding_extents += nr_extents; - - nr_extents = 0; - if (inode->outstanding_extents > inode->reserved_extents) - nr_extents += inode->outstanding_extents - - inode->reserved_extents; - - /* We always want to reserve a slot for updating the inode. */ - to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); - to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); - csum_bytes = inode->csum_bytes; + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { @@ -6113,92 +6097,26 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) goto out_fail; } - ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); + ret = btrfs_inode_rsv_refill(inode, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * fs_info->nodesize); goto out_fail; } - spin_lock(&inode->lock); - if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &inode->runtime_flags)) { - to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); - release_extra = true; - } - inode->reserved_extents += nr_extents; - spin_unlock(&inode->lock); - if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); - - if (to_reserve) - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_reserve, 1); - if (release_extra) - btrfs_block_rsv_release(fs_info, block_rsv, - btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - /* - * If the inodes csum_bytes is the same as the original - * csum_bytes then we know we haven't raced with any free()ers - * so we can just reduce our inodes csum bytes and carry on. - */ - if (inode->csum_bytes == csum_bytes) { - calc_csum_metadata_size(inode, num_bytes, 0); - } else { - u64 orig_csum_bytes = inode->csum_bytes; - u64 bytes; - - /* - * This is tricky, but first we need to figure out how much we - * freed from any free-ers that occurred during this - * reservation, so we reset ->csum_bytes to the csum_bytes - * before we dropped our lock, and then call the free for the - * number of bytes that were freed while we were trying our - * reservation. - */ - bytes = csum_bytes - inode->csum_bytes; - inode->csum_bytes = csum_bytes; - to_free = calc_csum_metadata_size(inode, bytes, 0); - - - /* - * Now we need to see how much we would have freed had we not - * been making this reservation and our ->csum_bytes were not - * artificially inflated. - */ - inode->csum_bytes = csum_bytes - num_bytes; - bytes = csum_bytes - orig_csum_bytes; - bytes = calc_csum_metadata_size(inode, bytes, 0); - - /* - * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have freed more space had we - * not had an artificially high ->csum_bytes, so we need to free - * the remainder. If bytes is the same or less then we don't - * need to do anything, the other free-ers did the correct - * thing. - */ - inode->csum_bytes = orig_csum_bytes - num_bytes; - if (bytes > to_free) - to_free = bytes - to_free; - else - to_free = 0; - } + nr_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -nr_extents); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); - if (to_free) { - btrfs_block_rsv_release(fs_info, block_rsv, to_free); - trace_btrfs_space_reservation(fs_info, "delalloc", - btrfs_ino(inode), to_free, 0); - } + btrfs_inode_rsv_release(inode); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -6206,36 +6124,55 @@ out_fail: /** * btrfs_delalloc_release_metadata - release a metadata reservation for an inode - * @inode: the inode to release the reservation for - * @num_bytes: the number of bytes we're releasing + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata - * reservations. + * reservations, or on error for the same reason. */ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); - u64 to_free = 0; - unsigned dropped; num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&inode->lock); - dropped = drop_outstanding_extent(inode, num_bytes); - - if (num_bytes) - to_free = calc_csum_metadata_size(inode, num_bytes, 0); + inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); - if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), - to_free, 0); + btrfs_inode_rsv_release(inode); +} + +/** + * btrfs_delalloc_release_extents - release our outstanding_extents + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with + * + * When we reserve space we increase outstanding_extents for the extents we may + * add. Once we've set the range as delalloc or created our ordered extents we + * have outstanding_extents to track the real usage, so we use this to free our + * temporarily tracked outstanding_extents. This _must_ be used in conjunction + * with btrfs_delalloc_reserve_metadata. + */ +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb); + unsigned num_extents; + + spin_lock(&inode->lock); + num_extents = count_max_extents(num_bytes); + btrfs_mod_outstanding_extents(inode, -num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; - btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); + btrfs_inode_rsv_release(inode); } /** @@ -6282,10 +6219,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * @inode: inode we're releasing space for * @start: start position of the space already reserved * @len: the len of the space already reserved - * - * This must be matched with a call to btrfs_delalloc_reserve_space. This is - * called in the case that we don't need the metadata AND data reservations - * anymore. So if there is an error or we insert an inline extent. + * @release_bytes: the len of the space we consumed or didn't use * * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes @@ -6293,7 +6227,8 @@ int btrfs_delalloc_reserve_space(struct inode *inode, * Also it will handle the qgroup reserved space. */ void btrfs_delalloc_release_space(struct inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len) { btrfs_delalloc_release_metadata(BTRFS_I(inode), len); btrfs_free_reserved_data_space(inode, reserved, start, len); @@ -6958,7 +6893,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (is_data) - skinny_metadata = 0; + skinny_metadata = false; ret = lookup_extent_backref(trans, info, path, &iref, bytenr, num_bytes, parent, @@ -7213,7 +7148,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (!list_empty(&head->ref_list)) + if (!RB_EMPTY_ROOT(&head->ref_tree)) goto out; if (head->extent_op) { @@ -7234,9 +7169,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * at this point we have a head with no other entries. Go * ahead and process it. */ - head->node.in_tree = 0; rb_erase(&head->href_node, &delayed_refs->href_root); - + RB_CLEAR_NODE(&head->href_node); atomic_dec(&delayed_refs->num_entries); /* @@ -7255,7 +7189,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, ret = 1; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(&head->node); + btrfs_put_delayed_ref_head(head); return ret; out: spin_unlock(&head->lock); @@ -7277,6 +7211,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { int old_ref_mod, new_ref_mod; + btrfs_ref_tree_mod(root, buf->start, buf->len, parent, + root->root_key.objectid, + btrfs_header_level(buf), 0, + BTRFS_DROP_DELAYED_REF); ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start, buf->len, parent, root->root_key.objectid, @@ -7329,16 +7267,21 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, + struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; int old_ref_mod, new_ref_mod; int ret; if (btrfs_is_testing(fs_info)) return 0; + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) + btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, + root_objectid, owner, offset, + BTRFS_DROP_DELAYED_REF); /* * tree log blocks never actually go into the extent allocation @@ -8306,17 +8249,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, + struct btrfs_root *root, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { - struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0, + root->root_key.objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, - ins->offset, 0, root_objectid, owner, + ins->offset, 0, + root->root_key.objectid, owner, offset, ram_bytes, BTRFS_ADD_DELAYED_EXTENT, NULL, NULL); return ret; @@ -8538,6 +8486,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; + btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent, + root_objectid, level, 0, + BTRFS_ADD_DELAYED_EXTENT); ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, @@ -8894,7 +8845,7 @@ skip: ret); } } - ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0); if (ret) @@ -9311,7 +9262,7 @@ out: * don't have it in the radix (like when we recover after a power fail * or unmount) so we don't leak memory. */ - if (!for_reloc && root_dropped == false) + if (!for_reloc && !root_dropped) btrfs_add_dead_root(root); if (err && err != -EAGAIN) btrfs_handle_fs_error(fs_info, err, NULL); @@ -9968,9 +9919,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) return 0; } -static void __link_block_group(struct btrfs_space_info *space_info, - struct btrfs_block_group_cache *cache) +static void link_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_space_info *space_info = cache->space_info; int index = get_block_group_index(cache); bool first = false; @@ -10178,7 +10129,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) cache->space_info = space_info; - __link_block_group(space_info, cache); + link_block_group(cache); set_avail_alloc_bits(info, cache->flags); if (btrfs_chunk_readonly(info, cache->key.objectid)) { @@ -10337,7 +10288,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->bytes_super, &cache->space_info); update_global_block_rsv(fs_info); - __link_block_group(cache->space_info, cache); + link_block_group(cache); list_add_tail(&cache->bg_list, &trans->new_bgs); @@ -10387,6 +10338,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * remove it. */ free_excluded_extents(fs_info, block_group); + btrfs_free_ref_tree_range(fs_info, block_group->key.objectid, + block_group->key.offset); memcpy(&key, &block_group->key, sizeof(key)); index = get_block_group_index(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3e5bb0cdd3cd..adbbc017191c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/bitops.h> #include <linux/slab.h> #include <linux/bio.h> @@ -109,7 +110,6 @@ struct extent_page_data { struct bio *bio; struct extent_io_tree *tree; get_extent_t *get_extent; - unsigned long bio_flags; /* tells writepage not to lock the state bits for this range * it still does the unlocking @@ -2761,8 +2761,8 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, */ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, - struct page *page, sector_t sector, - size_t size, unsigned long offset, + struct page *page, u64 offset, + size_t size, unsigned long pg_offset, struct block_device *bdev, struct bio **bio_ret, bio_end_io_t end_io_func, @@ -2776,6 +2776,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, int contig = 0; int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; size_t page_size = min_t(size_t, size, PAGE_SIZE); + sector_t sector = offset >> 9; if (bio_ret && *bio_ret) { bio = *bio_ret; @@ -2786,8 +2787,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(tree, page, offset, page_size, bio, bio_flags) || - bio_add_page(bio, page, page_size, offset) < page_size) { + merge_bio(tree, page, pg_offset, page_size, bio, bio_flags) || + bio_add_page(bio, page, page_size, pg_offset) < page_size) { ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; @@ -2801,8 +2802,8 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector << 9); - bio_add_page(bio, page, page_size, offset); + bio = btrfs_bio_alloc(bdev, offset); + bio_add_page(bio, page, page_size, pg_offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; @@ -2892,7 +2893,6 @@ static int __do_readpage(struct extent_io_tree *tree, u64 last_byte = i_size_read(inode); u64 block_start; u64 cur_end; - sector_t sector; struct extent_map *em; struct block_device *bdev; int ret = 0; @@ -2928,6 +2928,7 @@ static int __do_readpage(struct extent_io_tree *tree, } while (cur <= end) { bool force_bio_submit = false; + u64 offset; if (cur >= last_byte) { char *userpage; @@ -2967,9 +2968,9 @@ static int __do_readpage(struct extent_io_tree *tree, iosize = ALIGN(iosize, blocksize); if (this_bio_flag & EXTENT_BIO_COMPRESSED) { disk_io_size = em->block_len; - sector = em->block_start >> 9; + offset = em->block_start; } else { - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; disk_io_size = iosize; } bdev = em->bdev; @@ -3062,8 +3063,8 @@ static int __do_readpage(struct extent_io_tree *tree, } ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, - page, sector, disk_io_size, pg_offset, - bdev, bio, + page, offset, disk_io_size, + pg_offset, bdev, bio, end_bio_extent_readpage, mirror_num, *bio_flags, this_bio_flag, @@ -3324,7 +3325,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, u64 extent_offset; u64 block_start; u64 iosize; - sector_t sector; struct extent_map *em; struct block_device *bdev; size_t pg_offset = 0; @@ -3367,6 +3367,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + u64 offset; if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) @@ -3388,7 +3389,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, BUG_ON(end < cur); iosize = min(em_end - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); - sector = (em->block_start + extent_offset) >> 9; + offset = em->block_start + extent_offset; bdev = em->bdev; block_start = em->block_start; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -3431,7 +3432,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, } ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - page, sector, iosize, pg_offset, + page, offset, iosize, pg_offset, bdev, &epd->bio, end_bio_extent_writepage, 0, 0, 0, false); @@ -3471,8 +3472,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned int write_flags = 0; unsigned long nr_written = 0; - if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = REQ_SYNC; + write_flags = wbc_to_write_flags(wbc); trace___extent_writepage(page, inode, wbc); @@ -3716,16 +3716,13 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; u32 nritems; unsigned long i, num_pages; - unsigned long bio_flags = 0; unsigned long start, end; - unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); - if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) - bio_flags = EXTENT_BIO_TREE_LOG; /* set btree blocks beyond nritems with 0 to avoid stale content. */ nritems = btrfs_header_nritems(eb); @@ -3749,11 +3746,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, - p, offset >> 9, PAGE_SIZE, 0, bdev, + p, offset, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, - 0, epd->bio_flags, bio_flags, false); - epd->bio_flags = bio_flags; + 0, 0, 0, false); if (ret) { set_btree_ioerr(p); if (PageWriteback(p)) @@ -3790,7 +3786,6 @@ int btree_write_cache_pages(struct address_space *mapping, .tree = tree, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; int ret = 0; int done = 0; @@ -4063,10 +4058,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) if (epd->bio) { int ret; - bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? REQ_SYNC : 0); - - ret = submit_one_bio(epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, 0); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4089,7 +4081,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = __extent_writepage(page, wbc, &epd); @@ -4114,7 +4105,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .get_extent = get_extent, .extent_locked = 1, .sync_io = mode == WB_SYNC_ALL, - .bio_flags = 0, }; struct writeback_control wbc_writepages = { .sync_mode = mode, @@ -4154,7 +4144,6 @@ int extent_writepages(struct extent_io_tree *tree, .get_extent = get_extent, .extent_locked = 0, .sync_io = wbc->sync_mode == WB_SYNC_ALL, - .bio_flags = 0, }; ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index faffa28ba707..4a8861379d3e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __EXTENTIO__ #define __EXTENTIO__ @@ -33,7 +34,6 @@ * type for this bio */ #define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_TREE_LOG 2 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 69850155870c..2e348fb0b280 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index a67b2def5413..64365bbc9b16 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __EXTENTMAP__ #define __EXTENTMAP__ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index aafcc785f840..f80254d82f40 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -856,7 +856,7 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, @@ -940,7 +940,7 @@ delete_extent_item: extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, fs_info, + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - @@ -1234,7 +1234,7 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1268,7 +1268,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1288,7 +1288,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1590,7 +1590,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, int ret = 0; bool only_release_metadata = false; bool force_page_uptodate = false; - bool need_unlock; nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE), PAGE_SIZE / (sizeof(struct page *))); @@ -1613,6 +1612,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t copied; size_t dirty_sectors; size_t num_sectors; + int extents_locked; WARN_ON(num_pages > nrptrs); @@ -1656,6 +1656,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } } + WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); if (ret) { @@ -1669,7 +1670,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } release_bytes = reserve_bytes; - need_unlock = false; again: /* * This is going to setup the pages array with the number of @@ -1679,19 +1679,23 @@ again: ret = prepare_pages(inode, pages, num_pages, pos, write_bytes, force_page_uptodate); - if (ret) + if (ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); break; + } - ret = lock_and_cleanup_extent_if_need(BTRFS_I(inode), pages, + extents_locked = lock_and_cleanup_extent_if_need( + BTRFS_I(inode), pages, num_pages, pos, write_bytes, &lockstart, &lockend, &cached_state); - if (ret < 0) { - if (ret == -EAGAIN) + if (extents_locked < 0) { + if (extents_locked == -EAGAIN) goto again; + btrfs_delalloc_release_extents(BTRFS_I(inode), + reserve_bytes); + ret = extents_locked; break; - } else if (ret > 0) { - need_unlock = true; - ret = 0; } copied = btrfs_copy_from_user(pos, write_bytes, pages, i); @@ -1718,23 +1722,10 @@ again: PAGE_SIZE); } - /* - * If we had a short copy we need to release the excess delaloc - * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space and - * btrfs_delalloc_release_metadata will decrement it, but - * we still have an outstanding extent for the chunk we actually - * managed to copy. - */ if (num_sectors > dirty_sectors) { /* release everything except the sectors we dirtied */ release_bytes -= dirty_sectors << fs_info->sb->s_blocksize_bits; - if (copied > 0) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } if (only_release_metadata) { btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); @@ -1756,10 +1747,11 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, NULL); - if (need_unlock) + if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -2046,7 +2038,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - bool full_sync = 0; + bool full_sync = false; u64 len; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 684f12247db7..fe5e0324dca9 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1286,12 +1286,8 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { - u64 start, end; int ret; - start = block_group->key.objectid; - end = block_group->key.objectid + block_group->key.offset; - block_group->needs_free_space = 0; ret = add_new_free_space_info(trans, fs_info, block_group, path); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d02019747d00..022b19336fee 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -500,11 +500,12 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index c8e864b2d530..6734ec92a1e9 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __BTRFS_INODE_MAP #define __BTRFS_INODE_MAP diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 128f3e58634f..b93fe05a39c7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -42,6 +42,7 @@ #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> +#include <linux/magic.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -67,7 +68,6 @@ struct btrfs_iget_args { }; struct btrfs_dio_data { - u64 outstanding_extents; u64 reserve; u64 unsubmitted_oe_range_start; u64 unsubmitted_oe_range_end; @@ -135,6 +135,18 @@ static inline void btrfs_cleanup_ordered_extents(struct inode *inode, const u64 offset, const u64 bytes) { + unsigned long index = offset >> PAGE_SHIFT; + unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + index++; + if (!page) + continue; + ClearPagePrivate2(page); + put_page(page); + } return __endio_write_update_ordered(inode, offset + PAGE_SIZE, bytes - PAGE_SIZE, false); } @@ -304,7 +316,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -336,7 +348,6 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, } set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start); btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0); out: /* @@ -446,7 +457,6 @@ static noinline void compress_file_range(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes; u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -496,8 +506,6 @@ again: total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); - num_bytes = ALIGN(end - start + 1, blocksize); - num_bytes = max(blocksize, num_bytes); total_in = 0; ret = 0; @@ -530,7 +538,10 @@ again: */ extent_range_clear_dirty_for_io(inode, start, end); redirty = 1; - ret = btrfs_compress_pages(compress_type, + + /* Compression level is applied here and only here */ + ret = btrfs_compress_pages( + compress_type | (fs_info->compress_level << 4), inode->i_mapping, start, pages, &nr_pages, @@ -558,7 +569,7 @@ again: cont: if (start == 0) { /* lets try to make an inline extent */ - if (ret || total_in < (actual_end - start)) { + if (ret || total_in < actual_end) { /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ @@ -572,16 +583,21 @@ cont: } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING; unsigned long page_error_op; - clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. + * + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be done _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. */ extent_clear_unlock_delalloc(inode, start, end, end, NULL, clear_flags, @@ -590,10 +606,6 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - if (ret == 0) - btrfs_free_reserved_data_space_noquota(inode, - start, - end - start + 1); goto free_pages_out; } } @@ -613,7 +625,6 @@ cont: */ total_in = ALIGN(total_in, PAGE_SIZE); if (total_compressed + blocksize <= total_in) { - num_bytes = total_in; *num_added += 1; /* @@ -621,12 +632,12 @@ cont: * allocation on disk for these compressed pages, and * will submit them to the elevator. */ - add_async_extent(async_cow, start, num_bytes, + add_async_extent(async_cow, start, total_in, total_compressed, pages, nr_pages, compress_type); - if (start + num_bytes < end) { - start += num_bytes; + if (start + total_in < end) { + start += total_in; pages = NULL; cond_resched(); goto again; @@ -970,15 +981,19 @@ static noinline int cow_file_range(struct inode *inode, ret = cow_file_range_inline(root, inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); if (ret == 0) { + /* + * We use DO_ACCOUNTING here because we need the + * delalloc_release_metadata to be run _after_ we drop + * our outstanding extent for clearing delalloc for this + * range. + */ extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG, PAGE_UNLOCK | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -1214,13 +1229,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - while (atomic_read(&fs_info->async_submit_draining) && - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->async_delalloc_pages) == - 0)); - } - *nr_written += nr_pages; start = cur_end + 1; } @@ -1623,7 +1631,7 @@ static void btrfs_split_extent_hook(void *private_data, } spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1653,7 +1661,7 @@ static void btrfs_merge_extent_hook(void *private_data, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= BTRFS_MAX_EXTENT_SIZE) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); return; } @@ -1684,7 +1692,7 @@ static void btrfs_merge_extent_hook(void *private_data, return; spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); } @@ -1754,15 +1762,12 @@ static void btrfs_set_bit_hook(void *private_data, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + u32 num_extents = count_max_extents(len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - } + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); + spin_unlock(&BTRFS_I(inode)->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -1816,13 +1821,9 @@ static void btrfs_clear_bit_hook(void *private_data, struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); - if (*bits & EXTENT_FIRST_DELALLOC) { - *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { - spin_lock(&inode->lock); - inode->outstanding_extents -= num_extents; - spin_unlock(&inode->lock); - } + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -num_extents); + spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we @@ -2093,6 +2094,7 @@ again: 0); ClearPageChecked(page); set_page_dirty(page); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state, GFP_NOFS); @@ -2217,8 +2219,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret < 0) goto out; qg_released = ret; - ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), file_pos, qg_released, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + file_pos, qg_released, &ins); out: btrfs_free_path(path); @@ -2452,7 +2455,7 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path, ret = iterate_inodes_from_logical(old->bytenr + old->extent_offset, fs_info, path, record_one_backref, - old); + old, false); if (ret < 0 && ret != -ENOENT) return false; @@ -2670,7 +2673,7 @@ again: inode_add_bytes(inode, len); btrfs_release_path(path); - ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr, + ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ @@ -2952,7 +2955,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans = NULL; goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); @@ -2988,7 +2991,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - trans->block_rsv = &fs_info->delalloc_block_rsv; + trans->block_rsv = &BTRFS_I(inode)->block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; @@ -3046,9 +3049,6 @@ out: 0, &cached_state, GFP_NOFS); } - if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(BTRFS_I(inode), - ordered_extent->len); if (trans) btrfs_end_transaction(trans); @@ -4360,47 +4360,11 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } -static int truncate_inline_extent(struct inode *inode, - struct btrfs_path *path, - struct btrfs_key *found_key, - const u64 item_end, - const u64 new_size) -{ - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - struct btrfs_file_extent_item *fi; - u32 size = (u32)(new_size - found_key->offset); - struct btrfs_root *root = BTRFS_I(inode)->root; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { - loff_t offset = new_size; - loff_t page_end = ALIGN(offset, PAGE_SIZE); - - /* - * Zero out the remaining of the last page of our inline extent, - * instead of directly truncating our inline extent here - that - * would be much more complex (decompressing all the data, then - * compressing the truncated data, which might be bigger than - * the size of the inline extent, resize the extent, etc). - * We release the path because to get the page we might need to - * read the extent item from disk (data not in the page cache). - */ - btrfs_release_path(path); - return btrfs_truncate_block(inode, offset, page_end - offset, - 0); - } - - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root->fs_info, path, size, 1); - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - new_size); - - return 0; -} +/* + * Return this if we need to call truncate_block for the last bit of the + * truncate. + */ +#define NEED_TRUNCATE_BLOCK 1 /* * this can truncate away extent items, csum items and directory items. @@ -4439,9 +4403,9 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int err = 0; u64 ino = btrfs_ino(BTRFS_I(inode)); u64 bytes_deleted = 0; - bool be_nice = 0; - bool should_throttle = 0; - bool should_end = 0; + bool be_nice = false; + bool should_throttle = false; + bool should_end = false; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); @@ -4451,7 +4415,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, */ if (!btrfs_is_free_space_inode(BTRFS_I(inode)) && test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - be_nice = 1; + be_nice = true; path = btrfs_alloc_path(); if (!path) @@ -4561,11 +4525,6 @@ search_again: if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; - if (del_item) - last_size = found_key.offset; - else - last_size = new_size; - if (extent_type != BTRFS_FILE_EXTENT_INLINE) { u64 num_dec; extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); @@ -4607,40 +4566,30 @@ search_again: */ if (!del_item && btrfs_file_extent_encryption(leaf, fi) == 0 && - btrfs_file_extent_other_encoding(leaf, fi) == 0) { - + btrfs_file_extent_other_encoding(leaf, fi) == 0 && + btrfs_file_extent_compression(leaf, fi) == 0) { + u32 size = (u32)(new_size - found_key.offset); + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root->fs_info, path, size, 1); + } else if (!del_item) { /* - * Need to release path in order to truncate a - * compressed extent. So delete any accumulated - * extent items so far. + * We have to bail so the last_size is set to + * just before this extent. */ - if (btrfs_file_extent_compression(leaf, fi) != - BTRFS_COMPRESS_NONE && pending_del_nr) { - err = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - if (err) { - btrfs_abort_transaction(trans, - err); - goto error; - } - pending_del_nr = 0; - } + err = NEED_TRUNCATE_BLOCK; + break; + } - err = truncate_inline_extent(inode, path, - &found_key, - item_end, - new_size); - if (err) { - btrfs_abort_transaction(trans, err); - goto error; - } - } else if (test_bit(BTRFS_ROOT_REF_COWS, - &root->state)) { + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) inode_sub_bytes(inode, item_end + 1 - new_size); - } } delete: + if (del_item) + last_size = found_key.offset; + else + last_size = new_size; if (del_item) { if (!pending_del_nr) { /* no pending yet, add ourselves */ @@ -4657,14 +4606,14 @@ delete: } else { break; } - should_throttle = 0; + should_throttle = false; if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == fs_info->tree_root)) { btrfs_set_path_blocking(path); bytes_deleted += extent_num_bytes; - ret = btrfs_free_extent(trans, fs_info, extent_start, + ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset); @@ -4676,11 +4625,11 @@ delete: if (be_nice) { if (truncate_space_check(trans, root, extent_num_bytes)) { - should_end = 1; + should_end = true; } if (btrfs_should_throttle_delayed_refs(trans, fs_info)) - should_throttle = 1; + should_throttle = true; } } @@ -4789,8 +4738,11 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, (!len || ((len & (blocksize - 1)) == 0))) goto out; + block_start = round_down(from, blocksize); + block_end = block_start + blocksize - 1; + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, - round_down(from, blocksize), blocksize); + block_start, blocksize); if (ret) goto out; @@ -4798,15 +4750,12 @@ again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, - round_down(from, blocksize), - blocksize); + block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } - block_start = round_down(from, blocksize); - block_end = block_start + blocksize - 1; - if (!PageUptodate(page)) { ret = btrfs_readpage(NULL, page); lock_page(page); @@ -4871,6 +4820,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -7785,33 +7735,6 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, return em; } -static void adjust_dio_outstanding_extents(struct inode *inode, - struct btrfs_dio_data *dio_data, - const u64 len) -{ - unsigned num_extents = count_max_extents(len); - - /* - * If we have an outstanding_extents count still set then we're - * within our reservation, otherwise we need to adjust our inode - * counter appropriately. - */ - if (dio_data->outstanding_extents >= num_extents) { - dio_data->outstanding_extents -= num_extents; - } else { - /* - * If dio write length has been split due to no large enough - * contiguous space, we need to compensate our inode counter - * appropriately. - */ - u64 num_needed = num_extents - dio_data->outstanding_extents; - - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_needed; - spin_unlock(&BTRFS_I(inode)->lock); - } -} - static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -7973,7 +7896,6 @@ unlock: if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); - adjust_dio_outstanding_extents(inode, dio_data, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -8003,14 +7925,6 @@ unlock_err: err: if (dio_data) current->journal_info = dio_data; - /* - * Compensate the delalloc release we do in btrfs_direct_IO() when we - * write less data then expected, so that we don't underflow our inode's - * outstanding extents counter. - */ - if (create && dio_data) - adjust_dio_outstanding_extents(inode, dio_data, len); - return ret; } @@ -8357,11 +8271,8 @@ static void btrfs_endio_direct_read(struct bio *bio) struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); blk_status_t err = bio->bi_status; - if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) { + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) err = btrfs_subio_endio_read(inode, io_bio, err); - if (!err) - bio->bi_status = 0; - } unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); @@ -8369,7 +8280,7 @@ static void btrfs_endio_direct_read(struct bio *bio) kfree(dip); - dio_bio->bi_status = bio->bi_status; + dio_bio->bi_status = err; dio_end_io(dio_bio); if (io_bio->end_io) @@ -8387,6 +8298,7 @@ static void __endio_write_update_ordered(struct inode *inode, btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; + u64 last_offset; int ret; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -8398,6 +8310,7 @@ static void __endio_write_update_ordered(struct inode *inode, } again: + last_offset = ordered_offset; ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, @@ -8409,6 +8322,12 @@ again: btrfs_queue_work(wq, &ordered->work); out_test: /* + * If btrfs_dec_test_ordered_pending does not find any ordered extent + * in the range, we can exit. + */ + if (ordered_offset == last_offset) + return; + /* * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again */ @@ -8478,7 +8397,7 @@ static void btrfs_end_dio_bio(struct bio *bio) if (dip->errors) { bio_io_error(dip->orig_bio); } else { - dip->dio_bio->bi_status = 0; + dip->dio_bio->bi_status = BLK_STS_OK; bio_endio(dip->orig_bio); } out: @@ -8560,7 +8479,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, goto err; } map: - ret = btrfs_map_bio(fs_info, bio, 0, async_submit); + ret = btrfs_map_bio(fs_info, bio, 0, 0); err: bio_put(bio); return ret; @@ -8769,7 +8688,6 @@ free_ordered: } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, - struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8816,7 +8734,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) bool relock = false; ssize_t ret; - if (check_direct_IO(fs_info, iocb, iter, offset)) + if (check_direct_IO(fs_info, iter, offset)) return 0; inode_dio_begin(inode); @@ -8851,7 +8769,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) offset, count); if (ret) goto out; - dio_data.outstanding_extents = count_max_extents(count); /* * We need to know how many extents we reserved so that we can @@ -8898,6 +8815,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9215,9 +9133,6 @@ again: fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE - reserved_space); } @@ -9269,12 +9184,14 @@ again: out_unlock: if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; } unlock_page(page); out: + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space); out_noreserve: @@ -9370,12 +9287,12 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); + trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } - trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -9399,6 +9316,27 @@ static int btrfs_truncate(struct inode *inode) trans->block_rsv = rsv; } + /* + * We can't call btrfs_truncate_block inside a trans handle as we could + * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know + * we've truncated everything except the last little bit, and can do + * btrfs_truncate_block and then update the disk_i_size. + */ + if (ret == NEED_TRUNCATE_BLOCK) { + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + + ret = btrfs_truncate_block(inode, inode->i_size, 0, 0); + if (ret) + goto out; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + } + if (ret == 0 && inode->i_nlink > 0) { trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, BTRFS_I(inode)); @@ -9463,6 +9401,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct inode *btrfs_alloc_inode(struct super_block *sb) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; @@ -9489,8 +9428,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) spin_lock_init(&ei->lock); ei->outstanding_extents = 0; - ei->reserved_extents = 0; - + if (sb->s_magic != BTRFS_TEST_MAGIC) + btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, + BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; @@ -9540,8 +9480,9 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!hlist_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(BTRFS_I(inode)->block_rsv.reserved); + WARN_ON(BTRFS_I(inode)->block_rsv.size); WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); @@ -10320,19 +10261,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) ret = __start_delalloc_inodes(root, delay_iput, -1); if (ret > 0) ret = 0; - /* - * the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); return ret; } @@ -10374,14 +10302,6 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, spin_unlock(&fs_info->delalloc_root_lock); ret = 0; - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); out: if (!list_empty_careful(&splice)) { spin_lock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d6715c2bcdc4..fd172a93d11a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -86,6 +86,19 @@ struct btrfs_ioctl_received_subvol_args_32 { struct btrfs_ioctl_received_subvol_args_32) #endif +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) +struct btrfs_ioctl_send_args_32 { + __s64 send_fd; /* in */ + __u64 clone_sources_count; /* in */ + compat_uptr_t clone_sources; /* in */ + __u64 parent_root; /* in */ + __u64 flags; /* in */ + __u64 reserved[4]; /* in */ +} __attribute__ ((__packed__)); + +#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ + struct btrfs_ioctl_send_args_32) +#endif static int btrfs_clone(struct inode *src, struct inode *inode, u64 off, u64 olen, u64 olen_aligned, u64 destoff, @@ -609,23 +622,6 @@ fail_free: return ret; } -static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) -{ - s64 writers; - DEFINE_WAIT(wait); - - do { - prepare_to_wait(&root->subv_writers->wait, &wait, - TASK_UNINTERRUPTIBLE); - - writers = percpu_counter_sum(&root->subv_writers->counter); - if (writers) - schedule(); - - finish_wait(&root->subv_writers->wait, &wait); - } while (writers); -} - static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct dentry *dentry, u64 *async_transid, bool readonly, @@ -654,7 +650,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); - btrfs_wait_for_no_snapshotting_writes(root); + /* wait for no snapshot writes */ + wait_event(root->subv_writers->wait, + percpu_counter_sum(&root->subv_writers->counter) == 0); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -1219,6 +1217,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1229,6 +1228,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; @@ -1420,21 +1420,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if (do_compress) { - /* the filemap_flush will queue IO into the worker threads, but - * we have to make sure the IO is actually started and that - * ordered extents get created before we return - */ - atomic_inc(&fs_info->async_submit_draining); - while (atomic_read(&fs_info->nr_async_submits) || - atomic_read(&fs_info->async_delalloc_pages)) { - wait_event(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) == 0 && - atomic_read(&fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&fs_info->async_submit_draining); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { @@ -1842,8 +1827,13 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + if (ret < 0) { + btrfs_end_transaction(trans); + goto out_reset; + } + + ret = btrfs_commit_transaction(trans); - btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); @@ -2179,7 +2169,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, inode = file_inode(file); ret = search_ioctl(inode, &args.key, &buf_size, - (char *)(&uarg->buf[0])); + (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; else if (ret == -EOVERFLOW && @@ -2773,9 +2763,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } mutex_unlock(&fs_devices->device_list_mutex); - fi_args->nodesize = fs_info->super_copy->nodesize; - fi_args->sectorsize = fs_info->super_copy->sectorsize; - fi_args->clone_alignment = fs_info->super_copy->sectorsize; + fi_args->nodesize = fs_info->nodesize; + fi_args->sectorsize = fs_info->sectorsize; + fi_args->clone_alignment = fs_info->sectorsize; if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -3032,7 +3022,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp) @@ -3706,7 +3696,7 @@ process_slot: if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, - fs_info, + root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), @@ -4061,6 +4051,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { @@ -4125,10 +4119,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info __user *user_dest; struct btrfs_space_info *info; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + static const u64 types[] = { + BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA + }; int num_types = 4; int alloc_size; int ret = 0; @@ -4500,8 +4496,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)(unsigned long)ipa->fspath, - (void *)(unsigned long)ipath->fspath, size); + ret = copy_to_user((void __user *)(unsigned long)ipa->fspath, + ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -4536,13 +4532,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) } static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, - void __user *arg) + void __user *arg, int version) { int ret = 0; int size; struct btrfs_ioctl_logical_ino_args *loi; struct btrfs_data_container *inodes = NULL; struct btrfs_path *path = NULL; + bool ignore_offset; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4551,13 +4548,30 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, if (IS_ERR(loi)) return PTR_ERR(loi); + if (version == 1) { + ignore_offset = false; + size = min_t(u32, loi->size, SZ_64K); + } else { + /* All reserved bits must be 0 for now */ + if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) { + ret = -EINVAL; + goto out_loi; + } + /* Only accept flags we have defined so far */ + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) { + ret = -EINVAL; + goto out_loi; + } + ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET; + size = min_t(u32, loi->size, SZ_16M); + } + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } - size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4566,20 +4580,21 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes); + build_ino_list, inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) goto out; - ret = copy_to_user((void *)(unsigned long)loi->inodes, - (void *)(unsigned long)inodes, size); + ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes, + size); if (ret) ret = -EFAULT; out: btrfs_free_path(path); kvfree(inodes); +out_loi: kfree(loi); return ret; @@ -5156,15 +5171,11 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } } ret = btrfs_commit_transaction(trans); - if (ret < 0) { - btrfs_abort_transaction(trans, ret); - goto out; - } - out: up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); @@ -5486,6 +5497,41 @@ out_drop_write: return ret; } +static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat) +{ + struct btrfs_ioctl_send_args *arg; + int ret; + + if (compat) { +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + struct btrfs_ioctl_send_args_32 args32; + + ret = copy_from_user(&args32, argp, sizeof(args32)); + if (ret) + return -EFAULT; + arg = kzalloc(sizeof(*arg), GFP_KERNEL); + if (!arg) + return -ENOMEM; + arg->send_fd = args32.send_fd; + arg->clone_sources_count = args32.clone_sources_count; + arg->clone_sources = compat_ptr(args32.clone_sources); + arg->parent_root = args32.parent_root; + arg->flags = args32.flags; + memcpy(arg->reserved, args32.reserved, + sizeof(args32.reserved)); +#else + return -ENOTTY; +#endif + } else { + arg = memdup_user(argp, sizeof(*arg)); + if (IS_ERR(arg)) + return PTR_ERR(arg); + } + ret = btrfs_ioctl_send(file, arg); + kfree(arg); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -5550,7 +5596,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(fs_info, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1); + case BTRFS_IOC_LOGICAL_INO_V2: + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2); case BTRFS_IOC_SPACE_INFO: return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { @@ -5591,7 +5639,11 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return btrfs_ioctl_send(file, argp); + return _btrfs_ioctl_send(file, argp, false); +#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) + case BTRFS_IOC_SEND_32: + return _btrfs_ioctl_send(file, argp, true); +#endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d433e75d489a..6c7f18cd3b61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -430,10 +430,15 @@ out: return ret; } +static void lzo_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, + .set_level = lzo_set_level, }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a3aca495e33e..5b311aeddcc8 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -242,6 +242,15 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } spin_unlock(&root->ordered_extent_lock); + /* + * We don't need the count_max_extents here, we can assume that all of + * that work has been done at higher layers, so this is truly the + * smallest the extent is going to get. + */ + spin_lock(&BTRFS_I(inode)->lock); + btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); + spin_unlock(&BTRFS_I(inode)->lock); + return 0; } @@ -591,11 +600,19 @@ void btrfs_remove_ordered_extent(struct inode *inode, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; bool dec_pending_ordered = false; - tree = &BTRFS_I(inode)->ordered_tree; + /* This is paired with btrfs_add_ordered_extent. */ + spin_lock(&btrfs_inode->lock); + btrfs_mod_outstanding_extents(btrfs_inode, -1); + spin_unlock(&btrfs_inode->lock); + if (root != fs_info->tree_root) + btrfs_delalloc_release_metadata(btrfs_inode, entry->len); + + tree = &btrfs_inode->ordered_tree; spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5c8b61c86e61..168fd03ca3ac 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -807,7 +807,6 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, } ret = 0; out: - set_bit(BTRFS_FS_QUOTA_DISABLING, &root->fs_info->flags); btrfs_free_path(path); return ret; } @@ -953,7 +952,6 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, if (!fs_info->quota_root) goto out; clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - set_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; @@ -1307,6 +1305,8 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, } } ret = del_qgroup_item(trans, quota_root, qgroupid); + if (ret && ret != -ENOENT) + goto out; while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -1441,7 +1441,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, u64 bytenr = qrecord->bytenr; int ret; - ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root); + ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); if (ret < 0) return ret; @@ -2031,7 +2031,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2042,7 +2042,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, * root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots); + record->bytenr, SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -2086,8 +2086,6 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (test_and_clear_bit(BTRFS_FS_QUOTA_ENABLING, &fs_info->flags)) set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - if (test_and_clear_bit(BTRFS_FS_QUOTA_DISABLING, &fs_info->flags)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); spin_lock(&fs_info->qgroup_lock); while (!list_empty(&fs_info->dirty_qgroups)) { @@ -2572,7 +2570,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 24a62224b24b..a7f79254ecca 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1326,6 +1326,9 @@ write_data: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } /* @@ -1582,6 +1585,10 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; finish: @@ -2107,6 +2114,10 @@ cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return -EIO; } @@ -2231,12 +2242,18 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; - for (i = 0; i < rbio->real_stripes; i++) { + /* + * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * to the end position, so this search can start from the first parity + * stripe. + */ + for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bbio->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } + ASSERT(i < rbio->real_stripes); /* Now we just support the sectorsize equals to page size */ ASSERT(fs_info->sectorsize == PAGE_SIZE); @@ -2454,6 +2471,9 @@ submit_write: cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2563,12 +2583,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int stripe; struct bio *bio; + bio_list_init(&bio_list); + ret = alloc_rbio_essential_pages(rbio); if (ret) goto cleanup; - bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this @@ -2636,6 +2656,10 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) cleanup: rbio_orig_end_io(rbio, BLK_STS_IOERR); + + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + return; finish: diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c new file mode 100644 index 000000000000..34878699d363 --- /dev/null +++ b/fs/btrfs/ref-verify.c @@ -0,0 +1,1031 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/stacktrace.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "delayed-ref.h" +#include "ref-verify.h" + +/* + * Used to keep track the roots and number of refs each root has for a given + * bytenr. This just tracks the number of direct references, no shared + * references. + */ +struct root_entry { + u64 root_objectid; + u64 num_refs; + struct rb_node node; +}; + +/* + * These are meant to represent what should exist in the extent tree, these can + * be used to verify the extent tree is consistent as these should all match + * what the extent tree says. + */ +struct ref_entry { + u64 root_objectid; + u64 parent; + u64 owner; + u64 offset; + u64 num_refs; + struct rb_node node; +}; + +#define MAX_TRACE 16 + +/* + * Whenever we add/remove a reference we record the action. The action maps + * back to the delayed ref action. We hold the ref we are changing in the + * action so we can account for the history properly, and we record the root we + * were called with since it could be different from ref_root. We also store + * stack traces because thats how I roll. + */ +struct ref_action { + int action; + u64 root; + struct ref_entry ref; + struct list_head list; + unsigned long trace[MAX_TRACE]; + unsigned int trace_len; +}; + +/* + * One of these for every block we reference, it holds the roots and references + * to it as well as all of the ref actions that have occured to it. We never + * free it until we unmount the file system in order to make sure re-allocations + * are happening properly. + */ +struct block_entry { + u64 bytenr; + u64 len; + u64 num_refs; + int metadata; + int from_disk; + struct rb_root roots; + struct rb_root refs; + struct rb_node node; + struct list_head actions; +}; + +static struct block_entry *insert_block_entry(struct rb_root *root, + struct block_entry *be) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct block_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct block_entry, node); + if (entry->bytenr > be->bytenr) + p = &(*p)->rb_left; + else if (entry->bytenr < be->bytenr) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&be->node, parent_node, p); + rb_insert_color(&be->node, root); + return NULL; +} + +static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n; + struct block_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < bytenr) + n = n->rb_right; + else if (entry->bytenr > bytenr) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +static struct root_entry *insert_root_entry(struct rb_root *root, + struct root_entry *re) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct root_entry *entry; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct root_entry, node); + if (entry->root_objectid > re->root_objectid) + p = &(*p)->rb_left; + else if (entry->root_objectid < re->root_objectid) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&re->node, parent_node, p); + rb_insert_color(&re->node, root); + return NULL; + +} + +static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2) +{ + if (ref1->root_objectid < ref2->root_objectid) + return -1; + if (ref1->root_objectid > ref2->root_objectid) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + if (ref1->owner < ref2->owner) + return -1; + if (ref1->owner > ref2->owner) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + return 0; +} + +static struct ref_entry *insert_ref_entry(struct rb_root *root, + struct ref_entry *ref) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent_node = NULL; + struct ref_entry *entry; + int cmp; + + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct ref_entry, node); + cmp = comp_refs(entry, ref); + if (cmp > 0) + p = &(*p)->rb_left; + else if (cmp < 0) + p = &(*p)->rb_right; + else + return entry; + } + + rb_link_node(&ref->node, parent_node, p); + rb_insert_color(&ref->node, root); + return NULL; + +} + +static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid) +{ + struct rb_node *n; + struct root_entry *entry = NULL; + + n = root->rb_node; + while (n) { + entry = rb_entry(n, struct root_entry, node); + if (entry->root_objectid < objectid) + n = n->rb_right; + else if (entry->root_objectid > objectid) + n = n->rb_left; + else + return entry; + } + return NULL; +} + +#ifdef CONFIG_STACKTRACE +static void __save_stack_trace(struct ref_action *ra) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = ra->trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + ra->trace_len = stack_trace.nr_entries; +} + +static void __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + struct stack_trace trace; + + if (ra->trace_len == 0) { + btrfs_err(fs_info, " ref-verify: no stacktrace"); + return; + } + trace.nr_entries = ra->trace_len; + trace.entries = ra->trace; + print_stack_trace(&trace, 2); +} +#else +static void inline __save_stack_trace(struct ref_action *ra) +{ +} + +static void inline __print_stack_trace(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, " ref-verify: no stacktrace support"); +} +#endif + +static void free_block_entry(struct block_entry *be) +{ + struct root_entry *re; + struct ref_entry *ref; + struct ref_action *ra; + struct rb_node *n; + + while ((n = rb_first(&be->roots))) { + re = rb_entry(n, struct root_entry, node); + rb_erase(&re->node, &be->roots); + kfree(re); + } + + while((n = rb_first(&be->refs))) { + ref = rb_entry(n, struct ref_entry, node); + rb_erase(&ref->node, &be->refs); + kfree(ref); + } + + while (!list_empty(&be->actions)) { + ra = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&ra->list); + kfree(ra); + } + kfree(be); +} + +static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, + u64 bytenr, u64 len, + u64 root_objectid) +{ + struct block_entry *be = NULL, *exist; + struct root_entry *re = NULL; + + re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); + be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + if (!be || !re) { + kfree(re); + kfree(be); + return ERR_PTR(-ENOMEM); + } + be->bytenr = bytenr; + be->len = len; + + re->root_objectid = root_objectid; + re->num_refs = 0; + + spin_lock(&fs_info->ref_verify_lock); + exist = insert_block_entry(&fs_info->block_tree, be); + if (exist) { + if (root_objectid) { + struct root_entry *exist_re; + + exist_re = insert_root_entry(&exist->roots, re); + if (exist_re) + kfree(re); + } + kfree(be); + return exist; + } + + be->num_refs = 0; + be->metadata = 0; + be->from_disk = 0; + be->roots = RB_ROOT; + be->refs = RB_ROOT; + INIT_LIST_HEAD(&be->actions); + if (root_objectid) + insert_root_entry(&be->roots, re); + else + kfree(re); + return be; +} + +static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, + u64 parent, u64 bytenr, int level) +{ + struct block_entry *be; + struct root_entry *re; + struct ref_entry *ref = NULL, *exist; + + ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + + if (parent) + ref->root_objectid = 0; + else + ref->root_objectid = ref_root; + ref->parent = parent; + ref->owner = level; + ref->offset = 0; + ref->num_refs = 1; + + be = add_block_entry(fs_info, bytenr, fs_info->nodesize, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs++; + be->from_disk = 1; + be->metadata = 1; + + if (!parent) { + ASSERT(ref_root); + re = lookup_root_entry(&be->roots, ref_root); + ASSERT(re); + re->num_refs++; + } + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + exist->num_refs++; + kfree(ref); + } + spin_unlock(&fs_info->ref_verify_lock); + + return 0; +} + +static int add_shared_data_ref(struct btrfs_fs_info *fs_info, + u64 parent, u32 num_refs, u64 bytenr, + u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, 0); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = parent; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing shared ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int add_extent_data_ref(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *dref, + u64 bytenr, u64 num_bytes) +{ + struct block_entry *be; + struct ref_entry *ref; + struct root_entry *re; + u64 ref_root = btrfs_extent_data_ref_root(leaf, dref); + u64 owner = btrfs_extent_data_ref_objectid(leaf, dref); + u64 offset = btrfs_extent_data_ref_offset(leaf, dref); + u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); + + ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + if (!ref) + return -ENOMEM; + be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ref); + return PTR_ERR(be); + } + be->num_refs += num_refs; + + ref->parent = 0; + ref->owner = owner; + ref->root_objectid = ref_root; + ref->offset = offset; + ref->num_refs = num_refs; + if (insert_ref_entry(&be->refs, ref)) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "existing ref when reading from disk?"); + kfree(ref); + return -EINVAL; + } + + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + spin_unlock(&fs_info->ref_verify_lock); + btrfs_err(fs_info, "missing root in new block entry?"); + return -EINVAL; + } + re->num_refs += num_refs; + spin_unlock(&fs_info->ref_verify_lock); + return 0; +} + +static int process_extent_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *key, + int slot, int *tree_block_level) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct extent_buffer *leaf = path->nodes[0]; + u32 item_size = btrfs_item_size_nr(leaf, slot); + unsigned long end, ptr; + u64 offset, flags, count; + int type, ret; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + if ((key->type == BTRFS_EXTENT_ITEM_KEY) && + flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *tree_block_level = btrfs_tree_block_level(leaf, info); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + if (key->type == BTRFS_METADATA_ITEM_KEY) + *tree_block_level = key->offset; + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + offset = btrfs_extent_inline_ref_offset(leaf, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, offset, 0, key->objectid, + *tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, offset, key->objectid, + *tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = add_extent_data_ref(fs_info, leaf, dref, + key->objectid, key->offset); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, offset, count, + key->objectid, key->offset); + break; + default: + btrfs_err(fs_info, "invalid key type in iref"); + ret = -EINVAL; + break; + } + if (ret) + break; + ptr += btrfs_extent_inline_ref_size(type); + } + return ret; +} + +static int process_leaf(struct btrfs_root *root, + struct btrfs_path *path, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + u32 count; + int i = 0, tree_block_level = 0, ret; + struct btrfs_key key; + int nritems = btrfs_header_nritems(leaf); + + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + switch (key.type) { + case BTRFS_EXTENT_ITEM_KEY: + *num_bytes = key.offset; + case BTRFS_METADATA_ITEM_KEY: + *bytenr = key.objectid; + ret = process_extent_item(fs_info, path, &key, i, + &tree_block_level); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, key.offset, 0, + key.objectid, tree_block_level); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + ret = add_tree_block(fs_info, 0, key.offset, + key.objectid, tree_block_level); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(leaf, i, + struct btrfs_extent_data_ref); + ret = add_extent_data_ref(fs_info, leaf, dref, *bytenr, + *num_bytes); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(leaf, i, + struct btrfs_shared_data_ref); + count = btrfs_shared_data_ref_count(leaf, sref); + ret = add_shared_data_ref(fs_info, key.offset, count, + *bytenr, *num_bytes); + break; + default: + break; + } + if (ret) + break; + } + return ret; +} + +/* Walk down to the leaf from the given level */ +static int walk_down_tree(struct btrfs_root *root, struct btrfs_path *path, + int level, u64 *bytenr, u64 *num_bytes) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_buffer *eb; + u64 block_bytenr, gen; + int ret = 0; + + while (level >= 0) { + if (level) { + block_bytenr = btrfs_node_blockptr(path->nodes[level], + path->slots[level]); + gen = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + eb = read_tree_block(fs_info, block_bytenr, gen); + if (IS_ERR(eb)) + return PTR_ERR(eb); + if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + return -EIO; + } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->nodes[level-1] = eb; + path->slots[level-1] = 0; + path->locks[level-1] = BTRFS_READ_LOCK_BLOCKING; + } else { + ret = process_leaf(root, path, bytenr, num_bytes); + if (ret) + break; + } + level--; + } + return ret; +} + +/* Walk up to the next node that needs to be processed */ +static int walk_up_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + int l; + + for (l = 0; l < BTRFS_MAX_LEVEL; l++) { + if (!path->nodes[l]) + continue; + if (l) { + path->slots[l]++; + if (path->slots[l] < + btrfs_header_nritems(path->nodes[l])) { + *level = l; + return 0; + } + } + btrfs_tree_unlock_rw(path->nodes[l], path->locks[l]); + free_extent_buffer(path->nodes[l]); + path->nodes[l] = NULL; + path->slots[l] = 0; + path->locks[l] = 0; + } + + return 1; +} + +static void dump_ref_action(struct btrfs_fs_info *fs_info, + struct ref_action *ra) +{ + btrfs_err(fs_info, +" Ref action %d, root %llu, ref_root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ra->action, ra->root, ra->ref.root_objectid, ra->ref.parent, + ra->ref.owner, ra->ref.offset, ra->ref.num_refs); + __print_stack_trace(fs_info, ra); +} + +/* + * Dumps all the information from the block entry to printk, it's going to be + * awesome. + */ +static void dump_block_entry(struct btrfs_fs_info *fs_info, + struct block_entry *be) +{ + struct ref_entry *ref; + struct root_entry *re; + struct ref_action *ra; + struct rb_node *n; + + btrfs_err(fs_info, +"dumping block entry [%llu %llu], num_refs %llu, metadata %d, from disk %d", + be->bytenr, be->len, be->num_refs, be->metadata, + be->from_disk); + + for (n = rb_first(&be->refs); n; n = rb_next(n)) { + ref = rb_entry(n, struct ref_entry, node); + btrfs_err(fs_info, +" ref root %llu, parent %llu, owner %llu, offset %llu, num_refs %llu", + ref->root_objectid, ref->parent, ref->owner, + ref->offset, ref->num_refs); + } + + for (n = rb_first(&be->roots); n; n = rb_next(n)) { + re = rb_entry(n, struct root_entry, node); + btrfs_err(fs_info, " root entry %llu, num_refs %llu", + re->root_objectid, re->num_refs); + } + + list_for_each_entry(ra, &be->actions, list) + dump_ref_action(fs_info, ra); +} + +/* + * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * @root: the root we are making this modification from. + * @bytenr: the bytenr we are modifying. + * @num_bytes: number of bytes. + * @parent: the parent bytenr. + * @ref_root: the original root owner of the bytenr. + * @owner: level in the case of metadata, inode in the case of data. + * @offset: 0 for metadata, file offset for data. + * @action: the action that we are doing, this is the same as the delayed ref + * action. + * + * This will add an action item to the given bytenr and do sanity checks to make + * sure we haven't messed something up. If we are making a new allocation and + * this block entry has history we will delete all previous actions as long as + * our sanity checks pass as they are no longer needed. + */ +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct ref_entry *ref = NULL, *exist; + struct ref_action *ra = NULL; + struct block_entry *be = NULL; + struct root_entry *re = NULL; + int ret = 0; + bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID; + + if (!btrfs_test_opt(root->fs_info, REF_VERIFY)) + return 0; + + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); + ra = kmalloc(sizeof(struct ref_action), GFP_NOFS); + if (!ra || !ref) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + + if (parent) { + ref->parent = parent; + } else { + ref->root_objectid = ref_root; + ref->owner = owner; + ref->offset = offset; + } + ref->num_refs = (action == BTRFS_DROP_DELAYED_REF) ? -1 : 1; + + memcpy(&ra->ref, ref, sizeof(struct ref_entry)); + /* + * Save the extra info from the delayed ref in the ref action to make it + * easier to figure out what is happening. The real ref's we add to the + * ref tree need to reflect what we save on disk so it matches any + * on-disk refs we pre-loaded. + */ + ra->ref.owner = owner; + ra->ref.offset = offset; + ra->ref.root_objectid = ref_root; + __save_stack_trace(ra); + + INIT_LIST_HEAD(&ra->list); + ra->action = action; + ra->root = root->objectid; + + /* + * This is an allocation, preallocate the block_entry in case we haven't + * used it before. + */ + ret = -EINVAL; + if (action == BTRFS_ADD_DELAYED_EXTENT) { + /* + * For subvol_create we'll just pass in whatever the parent root + * is and the new root objectid, so let's not treat the passed + * in root as if it really has a ref for this bytenr. + */ + be = add_block_entry(root->fs_info, bytenr, num_bytes, ref_root); + if (IS_ERR(be)) { + kfree(ra); + ret = PTR_ERR(be); + goto out; + } + be->num_refs++; + if (metadata) + be->metadata = 1; + + if (be->num_refs != 1) { + btrfs_err(fs_info, + "re-allocated a block that still has references to it!"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + goto out_unlock; + } + + while (!list_empty(&be->actions)) { + struct ref_action *tmp; + + tmp = list_first_entry(&be->actions, struct ref_action, + list); + list_del(&tmp->list); + kfree(tmp); + } + } else { + struct root_entry *tmp; + + if (!parent) { + re = kmalloc(sizeof(struct root_entry), GFP_NOFS); + if (!re) { + kfree(ref); + kfree(ra); + ret = -ENOMEM; + goto out; + } + /* + * This is the root that is modifying us, so it's the + * one we want to lookup below when we modify the + * re->num_refs. + */ + ref_root = root->objectid; + re->root_objectid = root->objectid; + re->num_refs = 0; + } + + spin_lock(&root->fs_info->ref_verify_lock); + be = lookup_block_entry(&root->fs_info->block_tree, bytenr); + if (!be) { + btrfs_err(fs_info, +"trying to do action %d to bytenr %llu num_bytes %llu but there is no existing entry!", + action, (unsigned long long)bytenr, + (unsigned long long)num_bytes); + dump_ref_action(fs_info, ra); + kfree(ref); + kfree(ra); + goto out_unlock; + } + + if (!parent) { + tmp = insert_root_entry(&be->roots, re); + if (tmp) { + kfree(re); + re = tmp; + } + } + } + + exist = insert_ref_entry(&be->refs, ref); + if (exist) { + if (action == BTRFS_DROP_DELAYED_REF) { + if (exist->num_refs == 0) { + btrfs_err(fs_info, +"dropping a ref for a existing root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + exist->num_refs--; + if (exist->num_refs == 0) { + rb_erase(&exist->node, &be->refs); + kfree(exist); + } + } else if (!be->metadata) { + exist->num_refs++; + } else { + btrfs_err(fs_info, +"attempting to add another ref for an existing ref on a tree block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + kfree(ref); + } else { + if (action == BTRFS_DROP_DELAYED_REF) { + btrfs_err(fs_info, +"dropping a ref for a root that doesn't have a ref on the block"); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + + if (!parent && !re) { + re = lookup_root_entry(&be->roots, ref_root); + if (!re) { + /* + * This shouldn't happen because we will add our re + * above when we lookup the be with !parent, but just in + * case catch this case so we don't panic because I + * didn't thik of some other corner case. + */ + btrfs_err(fs_info, "failed to find root %llu for %llu", + root->objectid, be->bytenr); + dump_block_entry(fs_info, be); + dump_ref_action(fs_info, ra); + kfree(ra); + goto out_unlock; + } + } + if (action == BTRFS_DROP_DELAYED_REF) { + if (re) + re->num_refs--; + be->num_refs--; + } else if (action == BTRFS_ADD_DELAYED_REF) { + be->num_refs++; + if (re) + re->num_refs++; + } + list_add_tail(&ra->list, &be->actions); + ret = 0; +out_unlock: + spin_unlock(&root->fs_info->ref_verify_lock); +out: + if (ret) + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + return ret; +} + +/* Free up the ref cache */ +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ + struct block_entry *be; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + while ((n = rb_first(&fs_info->block_tree))) { + be = rb_entry(n, struct block_entry, node); + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + cond_resched_lock(&fs_info->ref_verify_lock); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len) +{ + struct block_entry *be = NULL, *entry; + struct rb_node *n; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return; + + spin_lock(&fs_info->ref_verify_lock); + n = fs_info->block_tree.rb_node; + while (n) { + entry = rb_entry(n, struct block_entry, node); + if (entry->bytenr < start) { + n = n->rb_right; + } else if (entry->bytenr > start) { + n = n->rb_left; + } else { + be = entry; + break; + } + /* We want to get as close to start as possible */ + if (be == NULL || + (entry->bytenr < start && be->bytenr > start) || + (entry->bytenr < start && entry->bytenr > be->bytenr)) + be = entry; + } + + /* + * Could have an empty block group, maybe have something to check for + * this case to verify we were actually empty? + */ + if (!be) { + spin_unlock(&fs_info->ref_verify_lock); + return; + } + + n = &be->node; + while (n) { + be = rb_entry(n, struct block_entry, node); + n = rb_next(n); + if (be->bytenr < start && be->bytenr + be->len > start) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + continue; + } + if (be->bytenr < start) + continue; + if (be->bytenr >= start + len) + break; + if (be->bytenr + be->len > start + len) { + btrfs_err(fs_info, + "block entry overlaps a block group [%llu,%llu]!", + start, len); + dump_block_entry(fs_info, be); + } + rb_erase(&be->node, &fs_info->block_tree); + free_block_entry(be); + } + spin_unlock(&fs_info->ref_verify_lock); +} + +/* Walk down all roots and build the ref tree, meant to be called at mount */ +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_path *path; + struct btrfs_root *root; + struct extent_buffer *eb; + u64 bytenr = 0, num_bytes = 0; + int ret, level; + + if (!btrfs_test_opt(fs_info, REF_VERIFY)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + eb = btrfs_read_lock_root_node(fs_info->extent_root); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + level = btrfs_header_level(eb); + path->nodes[level] = eb; + path->slots[level] = 0; + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + while (1) { + /* + * We have to keep track of the bytenr/num_bytes we last hit + * because we could have run out of space for an inline ref, and + * would have had to added a ref key item which may appear on a + * different leaf from the original extent item. + */ + ret = walk_down_tree(fs_info->extent_root, path, level, + &bytenr, &num_bytes); + if (ret) + break; + ret = walk_up_tree(root, path, &level); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + if (ret) { + btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY); + btrfs_free_ref_cache(fs_info); + } + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h new file mode 100644 index 000000000000..3bf02ce0e1e2 --- /dev/null +++ b/fs/btrfs/ref-verify.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REF_VERIFY__ +#define __REF_VERIFY__ + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY +int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info); +void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info); +int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, u64 owner, u64 offset, + int action); +void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start, + u64 len); + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->ref_verify_lock); + fs_info->block_tree = RB_ROOT; +} +#else +static inline int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info) +{ + return 0; +} + +static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info) +{ +} + +static inline int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 parent, u64 ref_root, + u64 owner, u64 offset, int action) +{ + return 0; +} + +static inline void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, + u64 start, u64 len) +{ +} + +static inline void btrfs_init_ref_verify(struct btrfs_fs_info *fs_info) +{ +} + +#endif /* CONFIG_BTRFS_FS_REF_VERIFY */ +#endif /* _REF_VERIFY__ */ diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3a49a3c2fca4..4cf2eb67eba6 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1742,7 +1742,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); @@ -1751,7 +1751,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } - ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { @@ -1952,21 +1952,21 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); @@ -2400,11 +2400,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } @@ -2808,7 +2808,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root->fs_info, + ret = btrfs_inc_extent_ref(trans, root, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), @@ -3246,6 +3246,8 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -EIO; goto out; } @@ -3275,6 +3277,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 95bcc3cce78f..3338407ef0f0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -226,10 +226,6 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; int err = 0; int ret; - bool can_recover = true; - - if (sb_rdonly(fs_info->sb)) - can_recover = false; path = btrfs_alloc_path(); if (!path) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e3f6c49e5c4d..b2f871d80982 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -231,7 +231,7 @@ struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; const char *errstr; - sector_t sector; + u64 physical; u64 logical; struct btrfs_device *dev; }; @@ -797,10 +797,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", +"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, (char *)(unsigned long)ipath->fspath->val[i]); @@ -810,10 +810,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), - (unsigned long long)swarn->sector, + swarn->physical, root, inum, offset, ret); free_ipath(ipath); @@ -845,7 +845,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (!path) return; - swarn.sector = (sblock->pagev[0]->physical) >> 9; + swarn.physical = sblock->pagev[0]->physical; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -868,10 +868,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) item_size, &ref_root, &ref_level); btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu", +"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, rcu_str_deref(dev->name), - (unsigned long long)swarn.sector, + swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, ret < 0 ? -1 : ref_root); @@ -883,7 +883,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + scrub_print_warning_inode, &swarn, false); } out: @@ -1047,7 +1047,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) * can be found. */ ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, - scrub_fixup_readpage, fixup); + scrub_fixup_readpage, fixup, false); if (ret < 0) { uncorrectable = 1; goto out; @@ -4390,7 +4390,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) } ret = iterate_inodes_from_logical(logical, fs_info, path, - record_inode_for_nocow, nocow_ctx); + record_inode_for_nocow, nocow_ctx, false); if (ret != 0 && ret != -ENOENT) { btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d", diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 32b043ef8ac9..c10e4c70f02d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -26,6 +26,7 @@ #include <linux/radix-tree.h> #include <linux/vmalloc.h> #include <linux/string.h> +#include <linux/compat.h> #include "send.h" #include "backref.h" @@ -992,7 +993,6 @@ typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key, * path must point to the dir item when called. */ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *found_key, iterate_dir_item_t iterate, void *ctx) { int ret = 0; @@ -1271,12 +1271,6 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) */ if (ino >= bctx->cur_objectid) return 0; -#if 0 - if (ino > bctx->cur_objectid) - return 0; - if (offset + bctx->extent_len > bctx->cur_offset) - return 0; -#endif } bctx->found++; @@ -1429,7 +1423,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx); + backref_ctx, false); if (ret < 0) goto out; @@ -2630,7 +2624,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) } else { btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; goto out; } @@ -4106,8 +4100,8 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, int num, u64 dir, int index, - struct fs_path *name, void *ctx, struct list_head *refs) +static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, + void *ctx, struct list_head *refs) { int ret = 0; struct send_ctx *sctx = ctx; @@ -4143,8 +4137,7 @@ static int __record_new_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, num, dir, index, name, - ctx, &sctx->new_refs); + return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); } @@ -4153,8 +4146,8 @@ static int __record_deleted_ref(int num, u64 dir, int index, void *ctx) { struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, num, dir, index, name, - ctx, &sctx->deleted_refs); + return record_ref(sctx->parent_root, dir, name, ctx, + &sctx->deleted_refs); } static int record_new_ref(struct send_ctx *sctx) @@ -4498,7 +4491,7 @@ static int process_new_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_new_xattr, sctx); + __process_new_xattr, sctx); return ret; } @@ -4506,7 +4499,7 @@ static int process_new_xattr(struct send_ctx *sctx) static int process_deleted_xattr(struct send_ctx *sctx) { return iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_deleted_xattr, sctx); + __process_deleted_xattr, sctx); } struct find_xattr_ctx { @@ -4551,7 +4544,7 @@ static int find_xattr(struct btrfs_root *root, ctx.found_data = NULL; ctx.found_data_len = 0; - ret = iterate_dir_item(root, path, key, __find_xattr, &ctx); + ret = iterate_dir_item(root, path, __find_xattr, &ctx); if (ret < 0) return ret; @@ -4621,11 +4614,11 @@ static int process_changed_xattr(struct send_ctx *sctx) int ret = 0; ret = iterate_dir_item(sctx->send_root, sctx->left_path, - sctx->cmp_key, __process_changed_new_xattr, sctx); + __process_changed_new_xattr, sctx); if (ret < 0) goto out; ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - sctx->cmp_key, __process_changed_deleted_xattr, sctx); + __process_changed_deleted_xattr, sctx); out: return ret; @@ -4675,8 +4668,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) goto out; } - ret = iterate_dir_item(root, path, &found_key, - __process_new_xattr, sctx); + ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) goto out; @@ -4723,16 +4715,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, inode->i_mapping); - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, - last_index - index + 1); while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); + + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - break; + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, + NULL, index, last_index + 1 - index); + + page = find_or_create_page(inode->i_mapping, index, + GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + break; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, &sctx->ra, + NULL, page, index, last_index + 1 - index); } if (!PageUptodate(page)) { @@ -6162,9 +6165,7 @@ out: * Updates compare related fields in sctx and simply forwards to the actual * changed_xxx functions. */ -static int changed_cb(struct btrfs_root *left_root, - struct btrfs_root *right_root, - struct btrfs_path *left_path, +static int changed_cb(struct btrfs_path *left_path, struct btrfs_path *right_path, struct btrfs_key *key, enum btrfs_compare_tree_result result, @@ -6246,8 +6247,8 @@ static int full_send_tree(struct send_ctx *sctx) slot = path->slots[0]; btrfs_item_key_to_cpu(eb, &found_key, slot); - ret = changed_cb(send_root, NULL, path, NULL, - &found_key, BTRFS_COMPARE_TREE_NEW, sctx); + ret = changed_cb(path, NULL, &found_key, + BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; @@ -6365,13 +6366,12 @@ static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) spin_unlock(&root->root_item_lock); } -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) { int ret = 0; struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; struct send_ctx *sctx = NULL; u32 i; @@ -6407,13 +6407,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } - arg = memdup_user(arg_, sizeof(*arg)); - if (IS_ERR(arg)) { - ret = PTR_ERR(arg); - arg = NULL; - goto out; - } - /* * Check that we don't overflow at later allocations, we request * clone_sources_count + 1 items, and compare to unsigned long inside @@ -6654,7 +6647,6 @@ out: if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) btrfs_root_dec_send_in_progress(sctx->parent_root); - kfree(arg); kvfree(clone_sources_tmp); if (sctx) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 02e00166c4da..3aa4bc55754f 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -130,5 +130,5 @@ enum { #define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ -long btrfs_ioctl_send(struct file *mnt_file, void __user *arg); +long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 35a128acfbd1..65af029559b5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,7 +202,6 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { - struct super_block *sb = fs_info->sb; char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; @@ -228,7 +227,8 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) vaf.va = &args; if (__ratelimit(ratelimit)) - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, + fs_info ? fs_info->sb->s_id : "<unknown>", &vaf); va_end(args); } @@ -292,7 +292,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, vaf.va = &args; errstr = btrfs_decode_error(errno); - if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", s_id, function, line, &vaf, errno, errstr); @@ -326,6 +326,9 @@ enum { #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + Opt_ref_verify, +#endif Opt_err, }; @@ -387,6 +390,9 @@ static const match_table_t tokens = { {Opt_fragment_metadata, "fragment=metadata"}, {Opt_fragment_all, "fragment=all"}, #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + {Opt_ref_verify, "ref_verify"}, +#endif {Opt_err, NULL}, }; @@ -502,6 +508,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -549,9 +557,9 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force != saved_compress_force)) || (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { - btrfs_info(info, "%s %s compression", + btrfs_info(info, "%s %s compression, level %d", (compress_force) ? "force" : "use", - compress_type); + compress_type, info->compress_level); } compress_force = false; break; @@ -825,6 +833,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); break; #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + case Opt_ref_verify: + btrfs_info(info, "doing ref verification"); + btrfs_set_opt(info->mount_opt, REF_VERIFY); + break; +#endif case Opt_err: btrfs_info(info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1135,7 +1149,7 @@ static int btrfs_fill_super(struct super_block *sb, #ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; #endif - sb->s_flags |= MS_I_VERSION; + sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1205,8 +1219,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * happens. The pending operations are delayed to the * next commit after thawing. */ - if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) - __sb_end_write(sb, SB_FREEZE_WRITE); + if (sb_start_write_trylock(sb)) + sb_end_write(sb); else return 0; trans = btrfs_start_transaction(root, 0); @@ -1246,6 +1260,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); + if (info->compress_level) + seq_printf(seq, ":%d", info->compress_level); } if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); @@ -1305,6 +1321,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif + if (btrfs_test_opt(info, REF_VERIFY)) + seq_puts(seq, ",ref_verify"); seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); @@ -2112,7 +2130,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * succeed even if the Avail is zero. But this is better than the other * way around. */ - thresh = 4 * 1024 * 1024; + thresh = SZ_4M; if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; @@ -2319,6 +2337,9 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + ", ref-verify=on" +#endif "\n", btrfs_crc32c_impl()); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 883881b16c86..a28bba801264 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -247,7 +247,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, global_rsv_size_show); +BTRFS_ATTR(allocation, global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -256,15 +256,15 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); +BTRFS_ATTR(allocation, global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); -BTRFS_RAID_ATTR(total_bytes, raid_bytes_show); -BTRFS_RAID_ATTR(used_bytes, raid_bytes_show); +BTRFS_ATTR(raid, total_bytes, raid_bytes_show); +BTRFS_ATTR(raid, used_bytes, raid_bytes_show); static ssize_t raid_bytes_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -277,7 +277,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, down_read(&sinfo->groups_sem); list_for_each_entry(block_group, &sinfo->block_groups[index], list) { - if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes)) + if (&attr->attr == BTRFS_ATTR_PTR(raid, total_bytes)) val += block_group->key.offset; else val += btrfs_block_group_used(&block_group->item); @@ -287,8 +287,8 @@ static ssize_t raid_bytes_show(struct kobject *kobj, } static struct attribute *raid_attributes[] = { - BTRFS_RAID_ATTR_PTR(total_bytes), - BTRFS_RAID_ATTR_PTR(used_bytes), + BTRFS_ATTR_PTR(raid, total_bytes), + BTRFS_ATTR_PTR(raid, used_bytes), NULL }; @@ -311,7 +311,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, btrfs_space_info_show_##field) +BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -331,19 +331,20 @@ SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(space_info, total_bytes_pinned, + btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { - BTRFS_ATTR_PTR(flags), - BTRFS_ATTR_PTR(total_bytes), - BTRFS_ATTR_PTR(bytes_used), - BTRFS_ATTR_PTR(bytes_pinned), - BTRFS_ATTR_PTR(bytes_reserved), - BTRFS_ATTR_PTR(bytes_may_use), - BTRFS_ATTR_PTR(bytes_readonly), - BTRFS_ATTR_PTR(disk_used), - BTRFS_ATTR_PTR(disk_total), - BTRFS_ATTR_PTR(total_bytes_pinned), + BTRFS_ATTR_PTR(space_info, flags), + BTRFS_ATTR_PTR(space_info, total_bytes), + BTRFS_ATTR_PTR(space_info, bytes_used), + BTRFS_ATTR_PTR(space_info, bytes_pinned), + BTRFS_ATTR_PTR(space_info, bytes_reserved), + BTRFS_ATTR_PTR(space_info, bytes_may_use), + BTRFS_ATTR_PTR(space_info, bytes_readonly), + BTRFS_ATTR_PTR(space_info, disk_used), + BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, total_bytes_pinned), NULL, }; @@ -361,8 +362,8 @@ struct kobj_type space_info_ktype = { }; static const struct attribute *allocation_attrs[] = { - BTRFS_ATTR_PTR(global_rsv_reserved), - BTRFS_ATTR_PTR(global_rsv_size), + BTRFS_ATTR_PTR(allocation, global_rsv_reserved), + BTRFS_ATTR_PTR(allocation, global_rsv_size), NULL, }; @@ -415,7 +416,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); +BTRFS_ATTR_RW(, label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -425,7 +426,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR(nodesize, btrfs_nodesize_show); +BTRFS_ATTR(, nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -436,7 +437,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); +BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -447,7 +448,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, fs_info->super_copy->sectorsize); } -BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); +BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); static ssize_t quota_override_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -487,14 +488,14 @@ static ssize_t quota_override_store(struct kobject *kobj, return len; } -BTRFS_ATTR_RW(quota_override, quota_override_show, quota_override_store); +BTRFS_ATTR_RW(, quota_override, quota_override_show, quota_override_store); static const struct attribute *btrfs_attrs[] = { - BTRFS_ATTR_PTR(label), - BTRFS_ATTR_PTR(nodesize), - BTRFS_ATTR_PTR(sectorsize), - BTRFS_ATTR_PTR(clone_alignment), - BTRFS_ATTR_PTR(quota_override), + BTRFS_ATTR_PTR(, label), + BTRFS_ATTR_PTR(, nodesize), + BTRFS_ATTR_PTR(, sectorsize), + BTRFS_ATTR_PTR(, clone_alignment), + BTRFS_ATTR_PTR(, quota_override), NULL, }; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index d7da1a4c2f6c..80457f31c29f 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BTRFS_SYSFS_H_ #define _BTRFS_SYSFS_H_ @@ -20,21 +21,16 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _show, _store) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) -#define BTRFS_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_attr_##_name = \ +#define BTRFS_ATTR(_prefix, _name, _show) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) -#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) - -#define BTRFS_RAID_ATTR(_name, _show) \ - static struct kobj_attribute btrfs_raid_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) - -#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) +#define BTRFS_ATTR_PTR(_prefix, _name) \ + (&btrfs_attr_##_prefix##_##_name.attr) struct btrfs_feature_attr { @@ -43,15 +39,16 @@ struct btrfs_feature_attr { u64 feature_bit; }; -#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \ -static struct btrfs_feature_attr btrfs_attr_##_name = { \ +#define BTRFS_FEAT_ATTR(_name, _feature_set, _feature_prefix, _feature_bit) \ +static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \ btrfs_feature_attr_show, \ btrfs_feature_attr_store), \ .feature_set = _feature_set, \ - .feature_bit = _prefix ##_## _feature_bit, \ + .feature_bit = _feature_prefix ##_## _feature_bit, \ } -#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr) +#define BTRFS_FEAT_ATTR_PTR(_name) \ + (&btrfs_attr_features_##_name.kobj_attr.attr) #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1458bb0ea124..8444a018cca2 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -500,7 +500,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, path = btrfs_alloc_path(); if (!path) { test_msg("Couldn't allocate path\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } ret = add_block_group_free_space(&trans, root->fs_info, cache); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 8c91d03cc82d..f797642c013d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -770,7 +770,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, 4096 * 1024, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -968,7 +968,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, NULL, 0); if (ret) { @@ -983,7 +982,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, NULL, 0); @@ -1003,7 +1001,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | - EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1017,7 +1015,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, @@ -1035,12 +1032,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] - * - * I'm artificially adding 2 to outstanding_extents because in the - * buffered IO case we'd add things up as we go, but I don't feel like - * doing that here, this isn't the interesting case we want to test. */ - BTRFS_I(inode)->outstanding_extents += 2; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, @@ -1059,7 +1051,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1079,7 +1070,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1096,7 +1087,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) * Refill the hole again just for good measure, because I thought it * might fail and I'd rather satisfy my paranoia at this point. */ - BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); @@ -1114,7 +1104,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* Empty */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); @@ -1131,7 +1121,7 @@ out: if (ret) clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 0f4ce970d195..90204b166643 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -240,7 +240,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -252,7 +253,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -275,7 +277,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -286,7 +289,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -337,7 +341,8 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -349,7 +354,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -370,7 +376,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -382,7 +389,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -409,7 +417,8 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, + false); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); @@ -421,7 +430,8 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, + false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f615d59b0489..5a8c2649af2f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -797,8 +797,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - if (fs_info->global_block_rsv.space_info->full && - btrfs_check_space_for_delayed_refs(trans, fs_info)) + if (btrfs_check_space_for_delayed_refs(trans, fs_info)) return 1; return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); @@ -950,6 +949,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; + atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers); while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; @@ -985,6 +985,7 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, cond_resched(); start = end + 1; } + atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers); return werr; } @@ -1915,8 +1916,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { + /* + * We use writeback_inodes_sb here because if we used + * btrfs_start_delalloc_roots we would deadlock with fs freeze. + * Currently are holding the fs freeze lock, if we do an async flush + * we'll do btrfs_join_transaction() and deadlock because we need to + * wait for the fs freeze lock. Using the direct flushing we benefit + * from already being in a transaction and our join_transaction doesn't + * have to re-take the fs freeze lock. + */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) - return btrfs_start_delalloc_roots(fs_info, 1, -1); + writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c new file mode 100644 index 000000000000..114fc5f0ecc5 --- /dev/null +++ b/fs/btrfs/tree-checker.c @@ -0,0 +1,425 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +/* + * The module is used to catch unexpected/corrupted tree block data. + * Such behavior can be caused either by a fuzzed image or bugs. + * + * The objective is to do leaf/node validation checks when tree block is read + * from disk, and check *every* possible member, so other code won't + * need to checking them again. + * + * Due to the potential and unwanted damage, every checker needs to be + * carefully reviewed otherwise so it does not prevent mount of valid images. + */ + +#include "ctree.h" +#include "tree-checker.h" +#include "disk-io.h" +#include "compression.h" + +/* + * Error message should follow the following format: + * corrupt <type>: <identifier>, <reason>[, <bad_value>] + * + * @type: leaf or node + * @identifier: the necessary info to locate the leaf/node. + * It's recommened to decode key.objecitd/offset if it's + * meaningful. + * @reason: describe the error + * @bad_value: optional, it's recommened to output bad value and its + * expected value (range). + * + * Since comma is used to separate the components, only space is allowed + * inside each component. + */ + +/* + * Append generic "corrupt leaf/node root=%llu block=%llu slot=%d: " to @fmt. + * Allows callers to customize the output. + */ +__printf(4, 5) +static void generic_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root->objectid, btrfs_header_bytenr(eb), slot, &vaf); + va_end(args); +} + +/* + * Customized reporter for extent data item, since its key objectid and + * offset has its own meaning. + */ +__printf(4, 5) +static void file_extent_err(const struct btrfs_root *root, + const struct extent_buffer *eb, int slot, + const char *fmt, ...) +{ + struct btrfs_key key; + struct va_format vaf; + va_list args; + + btrfs_item_key_to_cpu(eb, &key, slot); + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + btrfs_crit(root->fs_info, + "corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV", + btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid, + btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf); + va_end(args); +} + +/* + * Return 0 if the btrfs_file_extent_##name is aligned to @alignment + * Else return 1 + */ +#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment) \ +({ \ + if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \ + file_extent_err((root), (leaf), (slot), \ + "invalid %s for file extent, have %llu, should be aligned to %u", \ + (#name), btrfs_file_extent_##name((leaf), (fi)), \ + (alignment)); \ + (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))); \ +}) + +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + file_extent_err(root, leaf, slot, +"unaligned file_offset for file extent, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) > BTRFS_FILE_EXTENT_TYPES) { + file_extent_err(root, leaf, slot, + "invalid type for file extent, have %u expect range [0, %u]", + btrfs_file_extent_type(leaf, fi), + BTRFS_FILE_EXTENT_TYPES); + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) > BTRFS_COMPRESS_TYPES) { + file_extent_err(root, leaf, slot, + "invalid compression for file extent, have %u expect range [0, %u]", + btrfs_file_extent_compression(leaf, fi), + BTRFS_COMPRESS_TYPES); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid encryption for file extent, have %u expect 0", + btrfs_file_extent_encryption(leaf, fi)); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + file_extent_err(root, leaf, slot, + "invalid file_offset for inline file extent, have %llu expect 0", + key->offset); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Uncompressed inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + file_extent_err(root, leaf, slot, + "invalid ram_bytes for uncompressed inline extent, have %u expect %llu", + item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)); + return -EUCLEAN; + } + return 0; + } + + /* Regular or preallocated extent has fixed item size */ + if (item_size != sizeof(*fi)) { + file_extent_err(root, leaf, slot, + "invalid item size for reg/prealloc file extent, have %u expect %zu", + item_size, sizeof(*fi)); + return -EUCLEAN; + } + if (CHECK_FE_ALIGNED(root, leaf, slot, fi, ram_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_bytenr, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, disk_num_bytes, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, offset, sectorsize) || + CHECK_FE_ALIGNED(root, leaf, slot, fi, num_bytes, sectorsize)) + return -EUCLEAN; + return 0; +} + +static int check_csum_item(struct btrfs_root *root, struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + u32 sectorsize = root->fs_info->sectorsize; + u32 csumsize = btrfs_super_csum_size(root->fs_info->super_copy); + + if (key->objectid != BTRFS_EXTENT_CSUM_OBJECTID) { + generic_err(root, leaf, slot, + "invalid key objectid for csum item, have %llu expect %llu", + key->objectid, BTRFS_EXTENT_CSUM_OBJECTID); + return -EUCLEAN; + } + if (!IS_ALIGNED(key->offset, sectorsize)) { + generic_err(root, leaf, slot, + "unaligned key offset for csum item, have %llu should be aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + if (!IS_ALIGNED(btrfs_item_size_nr(leaf, slot), csumsize)) { + generic_err(root, leaf, slot, + "unaligned item size for csum item, have %u should be aligned to %u", + btrfs_item_size_nr(leaf, slot), csumsize); + return -EUCLEAN; + } + return 0; +} + +/* + * Common point to switch the item-specific validation. + */ +static int check_leaf_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + int ret = 0; + + switch (key->type) { + case BTRFS_EXTENT_DATA_KEY: + ret = check_extent_data_item(root, leaf, key, slot); + break; + case BTRFS_EXTENT_CSUM_KEY: + ret = check_csum_item(root, leaf, key, slot); + break; + } + return ret; +} + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + /* No valid key type is 0, so all key should be larger than this key */ + struct btrfs_key prev_key = {0, 0, 0}; + struct btrfs_key key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. So + * skip this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); + /* if leaf is the root, then it's fine */ + if (leaf != eb) { + generic_err(check_root, leaf, 0, + "invalid nritems, have %u should not be 0 for non-root leaf", + nritems); + free_extent_buffer(eb); + return -EUCLEAN; + } + free_extent_buffer(eb); + } + return 0; + } + + if (nritems == 0) + return 0; + + /* + * Check the following things to make sure this is a good leaf, and + * leaf users won't need to bother with similar sanity checks: + * + * 1) key ordering + * 2) item offset and size + * No overlap, no hole, all inside the leaf. + * 3) item content + * If possible, do comprehensive sanity check. + * NOTE: All checks must only rely on the item data itself. + */ + for (slot = 0; slot < nritems; slot++) { + u32 item_end_expected; + int ret; + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&prev_key, &key) >= 0) { + generic_err(root, leaf, slot, + "bad key order, prev (%llu %u %llu) current (%llu %u %llu)", + prev_key.objectid, prev_key.type, + prev_key.offset, key.objectid, key.type, + key.offset); + return -EUCLEAN; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (slot == 0) + item_end_expected = BTRFS_LEAF_DATA_SIZE(fs_info); + else + item_end_expected = btrfs_item_offset_nr(leaf, + slot - 1); + if (btrfs_item_end_nr(leaf, slot) != item_end_expected) { + generic_err(root, leaf, slot, + "unexpected item end, have %u expect %u", + btrfs_item_end_nr(leaf, slot), + item_end_expected); + return -EUCLEAN; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just in case all the items are consistent to each other, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(fs_info)) { + generic_err(root, leaf, slot, + "slot end outside of leaf, have %u expect range [0, %u]", + btrfs_item_end_nr(leaf, slot), + BTRFS_LEAF_DATA_SIZE(fs_info)); + return -EUCLEAN; + } + + /* Also check if the item pointer overlaps with btrfs item. */ + if (btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item) > + btrfs_item_ptr_offset(leaf, slot)) { + generic_err(root, leaf, slot, + "slot overlaps with its data, item end %lu data start %lu", + btrfs_item_nr_offset(slot) + + sizeof(struct btrfs_item), + btrfs_item_ptr_offset(leaf, slot)); + return -EUCLEAN; + } + + /* Check if the item size and content meet other criteria */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + + prev_key.objectid = key.objectid; + prev_key.type = key.type; + prev_key.offset = key.offset; + } + + return 0; +} + +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + struct btrfs_key key, next_key; + int slot; + u64 bytenr; + int ret = 0; + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { + btrfs_crit(root->fs_info, +"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]", + root->objectid, node->start, + nr == 0 ? "small" : "large", nr, + BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)); + return -EUCLEAN; + } + + for (slot = 0; slot < nr - 1; slot++) { + bytenr = btrfs_node_blockptr(node, slot); + btrfs_node_key_to_cpu(node, &key, slot); + btrfs_node_key_to_cpu(node, &next_key, slot + 1); + + if (!bytenr) { + generic_err(root, node, slot, + "invalid NULL node pointer"); + ret = -EUCLEAN; + goto out; + } + if (!IS_ALIGNED(bytenr, root->fs_info->sectorsize)) { + generic_err(root, node, slot, + "unaligned pointer, have %llu should be aligned to %u", + bytenr, root->fs_info->sectorsize); + ret = -EUCLEAN; + goto out; + } + + if (btrfs_comp_cpu_keys(&key, &next_key) >= 0) { + generic_err(root, node, slot, + "bad key order, current (%llu %u %llu) next (%llu %u %llu)", + key.objectid, key.type, key.offset, + next_key.objectid, next_key.type, + next_key.offset); + ret = -EUCLEAN; + goto out; + } + } +out: + return ret; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h new file mode 100644 index 000000000000..96c486e95d70 --- /dev/null +++ b/fs/btrfs/tree-checker.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Qu Wenruo 2017. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program. + */ + +#ifndef __BTRFS_TREE_CHECKER__ +#define __BTRFS_TREE_CHECKER__ + +#include "ctree.h" +#include "extent_io.h" + +int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); + +#endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ad7f4bab640b..aa7c71cff575 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -717,7 +717,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, fs_info, + ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset); @@ -2699,34 +2699,36 @@ static void wait_log_commit(struct btrfs_root *root, int transid) * so we know that if ours is more than 2 older than the * current transaction, we're done */ - do { + for (;;) { prepare_to_wait(&root->log_commit_wait[index], &wait, TASK_UNINTERRUPTIBLE); - mutex_unlock(&root->log_mutex); - if (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])) - schedule(); + if (!(root->log_transid_committed < transid && + atomic_read(&root->log_commit[index]))) + break; - finish_wait(&root->log_commit_wait[index], &wait); + mutex_unlock(&root->log_mutex); + schedule(); mutex_lock(&root->log_mutex); - } while (root->log_transid_committed < transid && - atomic_read(&root->log_commit[index])); + } + finish_wait(&root->log_commit_wait[index], &wait); } static void wait_for_writer(struct btrfs_root *root) { DEFINE_WAIT(wait); - while (atomic_read(&root->log_writers)) { - prepare_to_wait(&root->log_writer_wait, - &wait, TASK_UNINTERRUPTIBLE); + for (;;) { + prepare_to_wait(&root->log_writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (!atomic_read(&root->log_writers)) + break; + mutex_unlock(&root->log_mutex); - if (atomic_read(&root->log_writers)) - schedule(); - finish_wait(&root->log_writer_wait, &wait); + schedule(); mutex_lock(&root->log_mutex); } + finish_wait(&root->log_writer_wait, &wait); } static inline void btrfs_remove_log_ctx(struct btrfs_root *root, @@ -4181,6 +4183,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; + u64 logged_start, logged_end; u64 test_gen; int ret = 0; int num = 0; @@ -4190,10 +4193,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, down_write(&inode->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; + logged_start = start; + logged_end = end; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { list_del_init(&em->list); - /* * Just an arbitrary number, this can be really CPU intensive * once we start getting a lot of extents, and really once we @@ -4208,6 +4212,12 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; + + if (em->start < logged_start) + logged_start = em->start; + if ((em->start + em->len - 1) > logged_end) + logged_end = em->start + em->len - 1; + /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); @@ -4216,7 +4226,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - btrfs_get_logged_extents(inode, logged_list, start, end); + btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end); /* * Some ordered extents started by fsync might have completed * before we could collect them into the list logged_list, which @@ -4637,7 +4647,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - struct extent_buffer *src = NULL; LIST_HEAD(logged_list); u64 last_extent = 0; int err = 0; @@ -4880,7 +4889,6 @@ again: goto next_slot; } - src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0e8f16c305df..f1ecb938ba4d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -360,7 +360,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long batch_run = 0; - unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; int sync_pending = 0; @@ -375,8 +374,6 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = device->bdev->bd_bdi; - limit = btrfs_async_submit_limit(fs_info); - limit = limit * 2 / 3; loop: spin_lock(&device->io_lock); @@ -443,13 +440,6 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; - /* - * atomic_dec_return implies a barrier for waitqueue_active - */ - if (atomic_dec_return(&fs_info->nr_async_bios) < limit && - waitqueue_active(&fs_info->async_submit_wait)) - wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* @@ -517,12 +507,6 @@ loop_lock: &device->work); goto done; } - /* unplug every 64 requests just for good measure */ - if (batch_run % 64 == 0) { - blk_finish_plug(&plug); - blk_start_plug(&plug); - sync_pending = 0; - } } cond_resched(); @@ -547,7 +531,7 @@ static void pending_bios_fn(struct btrfs_work *work) } -void btrfs_free_stale_device(struct btrfs_device *cur_dev) +static void btrfs_free_stale_device(struct btrfs_device *cur_dev) { struct btrfs_fs_devices *fs_devs; struct btrfs_device *dev; @@ -1068,14 +1052,15 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } -void btrfs_release_disk_super(struct page *page) +static void btrfs_release_disk_super(struct page *page) { kunmap(page); put_page(page); } -int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, - struct page **page, struct btrfs_super_block **disk_super) +static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, + struct btrfs_super_block **disk_super) { void *p; pgoff_t index; @@ -1817,8 +1802,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, return 0; } -struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, - struct btrfs_device *device) +static struct btrfs_device * btrfs_find_next_active_device( + struct btrfs_fs_devices *fs_devs, struct btrfs_device *device) { struct btrfs_device *next_device; @@ -2031,19 +2016,20 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, } btrfs_close_bdev(srcdev); - call_rcu(&srcdev->rcu, free_device); - /* - * unless fs_devices is seed fs, num_devices shouldn't go - * zero - */ - BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); - /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { struct btrfs_fs_devices *tmp_fs_devices; + /* + * On a mounted FS, num_devices can't be zero unless it's a + * seed. In case of a seed device being replaced, the replace + * target added to the sprout FS, so there will be no more + * device left under the seed FS. + */ + ASSERT(fs_devices->seeding); + tmp_fs_devices = fs_info->fs_devices; while (tmp_fs_devices) { if (tmp_fs_devices->seed == fs_devices) { @@ -2323,6 +2309,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 tmp; int seeding_dev = 0; int ret = 0; + bool unlocked = false; if (sb_rdonly(sb) && !fs_info->fs_devices->seeding) return -EROFS; @@ -2399,7 +2386,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + goto error_trans; + } } device->fs_devices = fs_info->fs_devices; @@ -2445,14 +2435,14 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } } ret = btrfs_add_device(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } if (seeding_dev) { @@ -2461,7 +2451,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); - goto error_trans; + goto error_sysfs; } /* Sprouting would change fsid of the mounted root, @@ -2479,6 +2469,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); + unlocked = true; if (ret) /* transaction commit */ return ret; @@ -2491,7 +2482,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (IS_ERR(trans)) { if (PTR_ERR(trans) == -ENOENT) return 0; - return PTR_ERR(trans); + ret = PTR_ERR(trans); + trans = NULL; + goto error_sysfs; } ret = btrfs_commit_transaction(trans); } @@ -2500,14 +2493,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: - btrfs_end_transaction(trans); + if (seeding_dev) + sb->s_flags |= MS_RDONLY; + if (trans) + btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev) { + if (seeding_dev && !unlocked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4813,16 +4810,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em_tree = &info->mapping_tree.map_tree; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); - if (!ret) { - list_add_tail(&em->list, &trans->transaction->pending_chunks); - refcount_inc(&em->refs); - } - write_unlock(&em_tree->lock); if (ret) { + write_unlock(&em_tree->lock); free_extent_map(em); goto error; } + list_add_tail(&em->list, &trans->transaction->pending_chunks); + refcount_inc(&em->refs); + write_unlock(&em_tree->lock); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -5695,10 +5692,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) + if (!need_full_stripe(op)) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5711,7 +5708,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { + if (need_full_stripe(op)) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5725,7 +5722,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) num_stripes = map->sub_stripes; else if (mirror_num) stripe_index += mirror_num - 1; @@ -5740,9 +5737,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || - mirror_num > 1)) { + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); @@ -5769,9 +5764,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && - op != BTRFS_MAP_GET_READ_MIRRORS) && - mirror_num <= 1) + if (!need_full_stripe(op) && mirror_num <= 1) mirror_num = 1; } } else { @@ -6033,7 +6026,7 @@ static void btrfs_end_bio(struct bio *bio) * this bio is actually up to date, we didn't * go over the max number of errors */ - bio->bi_status = 0; + bio->bi_status = BLK_STS_OK; } btrfs_end_bbio(bbio, bio); @@ -6069,13 +6062,6 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, return; } - /* - * nr_async_bios allows us to reliably return congestion to the - * higher layers. Otherwise, the async bio makes it appear we have - * made progress against dirty pages when we've really just put it - * on a queue for later - */ - atomic_inc(&fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; @@ -6144,7 +6130,10 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - bio->bi_status = BLK_STS_IOERR; + if (atomic_read(&bbio->error) > bbio->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; btrfs_end_bbio(bbio, bio); } } @@ -6166,7 +6155,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, bio_op(bio), logical, + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); @@ -6249,7 +6238,7 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) - return NULL; + return device; list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; @@ -6377,6 +6366,17 @@ static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info, return 0; } +static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, + u64 devid, u8 *uuid, bool error) +{ + if (error) + btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); + else + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", + devid, uuid); +} + static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) @@ -6447,18 +6447,21 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); - btrfs_report_missing_device(fs_info, devid, uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, uuid, true); + return -ENOENT; } if (!map->stripes[i].dev) { map->stripes[i].dev = add_missing_dev(fs_info->fs_devices, devid, uuid); - if (!map->stripes[i].dev) { + if (IS_ERR(map->stripes[i].dev)) { free_extent_map(em); - return -EIO; + btrfs_err(fs_info, + "failed to init missing dev %llu: %ld", + devid, PTR_ERR(map->stripes[i].dev)); + return PTR_ERR(map->stripes[i].dev); } - btrfs_report_missing_device(fs_info, devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid, false); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6577,19 +6580,28 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - return -EIO; + btrfs_report_missing_device(fs_info, devid, + dev_uuid, true); + return -ENOENT; } device = add_missing_dev(fs_devices, devid, dev_uuid); - if (!device) - return -ENOMEM; - btrfs_report_missing_device(fs_info, devid, dev_uuid); + if (IS_ERR(device)) { + btrfs_err(fs_info, + "failed to add missing dev %llu: %ld", + devid, PTR_ERR(device)); + return PTR_ERR(device); + } + btrfs_report_missing_device(fs_info, devid, dev_uuid, false); } else { if (!device->bdev) { - btrfs_report_missing_device(fs_info, devid, dev_uuid); - if (!btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, + devid, dev_uuid, true); + return -ENOENT; + } + btrfs_report_missing_device(fs_info, devid, + dev_uuid, false); } if(!device->bdev && !device->missing) { @@ -6756,12 +6768,6 @@ out_short_read: return -EIO; } -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid) -{ - btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); -} - /* * Check if all chunks in the fs are OK for read-write degraded mount * diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6108fdfec67f..ff15208344a7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -542,7 +542,5 @@ void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); -void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, - u8 *uuid); #endif diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c248f9286366..2b52950dc2c6 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -37,6 +37,7 @@ struct workspace { z_stream strm; char *buf; struct list_head list; + int level; }; static void zlib_free_workspace(struct list_head *ws) @@ -96,7 +97,7 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, workspace->level)) { pr_warn("BTRFS: deflateInit failed\n"); ret = -EIO; goto out; @@ -402,10 +403,22 @@ next: return ret; } +static void zlib_set_level(struct list_head *ws, unsigned int type) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + unsigned level = (type & 0xF0) >> 4; + + if (level > 9) + level = 9; + + workspace->level = level > 0 ? level : 3; +} + const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, + .set_level = zlib_set_level, }; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 607ce47b483a..17f2dd8fddb8 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -423,10 +423,15 @@ finish: return ret; } +static void zstd_set_level(struct list_head *ws, unsigned int type) +{ +} + const struct btrfs_compress_op btrfs_zstd_compress = { .alloc_workspace = zstd_alloc_workspace, .free_workspace = zstd_free_workspace, .compress_pages = zstd_compress_pages, .decompress_bio = zstd_decompress_bio, .decompress = zstd_decompress, + .set_level = zstd_set_level, }; |