diff options
Diffstat (limited to 'fs/btrfs')
65 files changed, 4939 insertions, 3879 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 67a607709d4f..53bb7af4e5f0 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - } else if (size == -ENOENT || size == -ENODATA || size == 0) { - /* FIXME, who returns -ENOENT? I think nobody */ + } else if (size == -ERANGE || size == -ENODATA || size == 0) { acl = NULL; } else { acl = ERR_PTR(-EIO); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5fb60ea7eee2..e0f071f6b5a7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -34,6 +34,10 @@ struct __btrfs_workqueue { struct workqueue_struct *normal_wq; + + /* File system this workqueue services */ + struct btrfs_fs_info *fs_info; + /* List head pointing to ordered work list */ struct list_head ordered_list; @@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \ normal_work_helper(work); \ } +struct btrfs_fs_info * +btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +{ + return wq->fs_info; +} + +struct btrfs_fs_info * +btrfs_work_owner(struct btrfs_work *work) +{ + return work->wq->fs_info; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper); BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, - int thresh) +__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, + unsigned int flags, int limit_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; + ret->fs_info = fs_info; ret->limit_active = limit_active; atomic_set(&ret->pending, 0); if (thresh == 0) @@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh) @@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (!ret) return NULL; - ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, + ret->normal = __btrfs_alloc_workqueue(fs_info, name, + flags & ~WQ_HIGHPRI, limit_active, thresh); if (!ret->normal) { kfree(ret); @@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, } if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(name, flags, limit_active, - thresh); + ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, + limit_active, thresh); if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ad4d0647d1a6..8e52484cd461 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -21,6 +21,7 @@ #define __BTRFS_ASYNC_THREAD_ #include <linux/workqueue.h> +struct btrfs_fs_info; struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; @@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper); BTRFS_WORK_HELPER_PROTO(scrubparity_helper); -struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, int limit_active, int thresh); @@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 80e8472d618b..455a6b2fd539 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void) btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", sizeof(struct __prelim_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_prelim_ref_cache) return -ENOMEM; @@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_test_is_dummy_root(root)) { + if (btrfs_is_testing(fs_info)) { srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -ENOENT; goto out; @@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode) list_del(&ref2->list); kmem_cache_free(btrfs_prelim_ref_cache, ref2); + cond_resched(); } } @@ -1939,7 +1940,7 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, * from ipath->fspath->val[i]. * when it returns, there are ipath->fspath->elem_cnt number of paths available * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the - * number of missed paths in recored in ipath->fspath->elem_missed, otherwise, + * number of missed paths is recorded in ipath->fspath->elem_missed, otherwise, * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would * have been needed to return all paths. */ @@ -1991,7 +1992,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, ifp = kmalloc(sizeof(*ifp), GFP_NOFS); if (!ifp) { - kfree(fspath); + vfree(fspath); return ERR_PTR(-ENOMEM); } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 61205e3bbefa..4919aedb5fc1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -196,6 +196,16 @@ struct btrfs_inode { struct list_head delayed_iput; long delayed_iput_count; + /* + * To avoid races between lockless (i_mutex not held) direct IO writes + * and concurrent fsync requests. Direct IO writes must acquire read + * access on this semaphore for creating an extent map and its + * corresponding ordered extent. The fast fsync path must acquire write + * access on this semaphore before it collects ordered extents and + * extent maps. + */ + struct rw_semaphore dio_sem; + struct inode vfs_inode; }; @@ -303,7 +313,7 @@ struct btrfs_dio_private { struct bio *dio_bio; /* - * The original bio may be splited to several sub-bios, this is + * The original bio may be split to several sub-bios, this is * done during endio of sub-bios */ int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 516e19d1d202..66789471b49d 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1673,6 +1673,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, } bio->bi_bdev = block_ctx->dev->bdev; bio->bi_iter.bi_sector = dev_bytenr >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -1685,7 +1686,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, "btrfsic: error, failed to add a single page!\n"); return -1; } - if (submit_bio_wait(READ, bio)) { + if (submit_bio_wait(bio)) { printk(KERN_INFO "btrfsic: read error at logical %llu dev %s!\n", block_ctx->start, block_ctx->dev->name); @@ -1939,7 +1940,7 @@ again: /* * Clear all references of this block. Do not free * the block itself even if is not referenced anymore - * because it still carries valueable information + * because it still carries valuable information * like whether it was ever written and IO completed. */ list_for_each_entry_safe(l, tmp, &block->ref_to_list, @@ -2206,7 +2207,7 @@ static void btrfsic_bio_end_io(struct bio *bp) block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2242,7 +2243,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate) block->dev_bytenr, block->mirror_num); block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_FLUSH) { + if (block->submit_bio_bh_rw & REQ_PREFLUSH) { dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) @@ -2645,7 +2646,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)", + indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", btrfsic_get_block_type(state, block), block->logical_bytenr, block->dev_state->name, block->dev_bytenr, block->mirror_num); @@ -2855,12 +2856,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup( return ds; } -int btrfsic_submit_bh(int rw, struct buffer_head *bh) +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) { struct btrfsic_dev_state *dev_state; if (!btrfsic_is_initialized) - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before @@ -2869,26 +2870,26 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) /* Only called to write the superblock (incl. FLUSH/FUA) */ if (NULL != dev_state && - (rw & WRITE) && bh->b_size > 0) { + (op == REQ_OP_WRITE) && bh->b_size > 0) { u64 dev_bytenr; dev_bytenr = 4096 * bh->b_blocknr; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu)," - " size=%zu, data=%p, bdev=%p)\n", - rw, (unsigned long long)bh->b_blocknr, + "submit_bh(op=0x%x,0x%x, blocknr=%llu " + "(bytenr %llu), size=%zu, data=%p, bdev=%p)\n", + op, op_flags, (unsigned long long)bh->b_blocknr, dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev); btrfsic_process_written_block(dev_state, dev_bytenr, &bh->b_data, 1, NULL, - NULL, bh, rw); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + NULL, bh, op_flags); + } else if (NULL != dev_state && (op_flags & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bh(rw=0x%x FLUSH, bdev=%p)\n", - rw, bh->b_bdev); + "submit_bh(op=0x%x,0x%x FLUSH, bdev=%p)\n", + op, op_flags, bh->b_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -2906,7 +2907,7 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = op_flags; block->orig_bio_bh_private = bh->b_private; block->orig_bio_bh_end_io.bh = bh->b_end_io; block->next_in_same_bio = NULL; @@ -2915,10 +2916,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh) } } mutex_unlock(&btrfsic_mutex); - return submit_bh(rw, bh); + return submit_bh(op, op_flags, bh); } -static void __btrfsic_submit_bio(int rw, struct bio *bio) +static void __btrfsic_submit_bio(struct bio *bio) { struct btrfsic_dev_state *dev_state; @@ -2930,7 +2931,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) * btrfsic_mount(), this might return NULL */ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && - (rw & WRITE) && NULL != bio->bi_io_vec) { + (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) { unsigned int i; u64 dev_bytenr; u64 cur_bytenr; @@ -2942,9 +2943,9 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x, bi_vcnt=%u," + "submit_bio(rw=%d,0x%x, bi_vcnt=%u," " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - rw, bio->bi_vcnt, + bio_op(bio), bio->bi_opf, bio->bi_vcnt, (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); @@ -2975,18 +2976,18 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, - NULL, rw); + NULL, bio->bi_opf); while (i > 0) { i--; kunmap(bio->bi_io_vec[i].bv_page); } kfree(mapped_datav); - } else if (NULL != dev_state && (rw & REQ_FLUSH)) { + } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) printk(KERN_INFO - "submit_bio(rw=0x%x FLUSH, bdev=%p)\n", - rw, bio->bi_bdev); + "submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -3004,7 +3005,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) block->never_written = 0; block->iodone_w_error = 0; block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = rw; + block->submit_bio_bh_rw = bio->bi_opf; block->orig_bio_bh_private = bio->bi_private; block->orig_bio_bh_end_io.bio = bio->bi_end_io; block->next_in_same_bio = NULL; @@ -3016,16 +3017,16 @@ leave: mutex_unlock(&btrfsic_mutex); } -void btrfsic_submit_bio(int rw, struct bio *bio) +void btrfsic_submit_bio(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - submit_bio(rw, bio); + __btrfsic_submit_bio(bio); + submit_bio(bio); } -int btrfsic_submit_bio_wait(int rw, struct bio *bio) +int btrfsic_submit_bio_wait(struct bio *bio) { - __btrfsic_submit_bio(rw, bio); - return submit_bio_wait(rw, bio); + __btrfsic_submit_bio(bio); + return submit_bio_wait(bio); } int btrfsic_mount(struct btrfs_root *root, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index 13b8566c97ab..f78dff1c7e86 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -20,9 +20,9 @@ #define __BTRFS_CHECK_INTEGRITY__ #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -int btrfsic_submit_bh(int rw, struct buffer_head *bh); -void btrfsic_submit_bio(int rw, struct bio *bio); -int btrfsic_submit_bio_wait(int rw, struct bio *bio); +int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh); +void btrfsic_submit_bio(struct bio *bio); +int btrfsic_submit_bio_wait(struct bio *bio); #else #define btrfsic_submit_bh submit_bh #define btrfsic_submit_bio submit_bio diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ff61a41ac90b..029db6e1105c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -363,6 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, kfree(cb); return -ENOMEM; } + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; atomic_inc(&cb->pending_bios); @@ -373,7 +374,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, page = compressed_pages[pg_index]; page->mapping = inode->i_mapping; if (bio->bi_iter.bi_size) - ret = io_tree->ops->merge_bio_hook(WRITE, page, 0, + ret = io_tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, bio, 0); else @@ -401,13 +402,17 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_map_bio(root, bio, 0, 1); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); BUG_ON(!bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -431,8 +436,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, WRITE, bio, 0, 1); - BUG_ON(ret); /* -ENOMEM */ + ret = btrfs_map_bio(root, bio, 0, 1); + if (ret) { + bio->bi_error = ret; + bio_endio(bio); + } bio_put(bio); return 0; @@ -646,6 +654,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); if (!comp_bio) goto fail2; + bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; atomic_inc(&cb->pending_bios); @@ -656,7 +665,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, page->index = em_start >> PAGE_SHIFT; if (comp_bio->bi_iter.bi_size) - ret = tree->ops->merge_bio_hook(READ, page, 0, + ret = tree->ops->merge_bio_hook(page, 0, PAGE_SIZE, comp_bio, 0); else @@ -687,8 +696,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, root->sectorsize); - ret = btrfs_map_bio(root, READ, comp_bio, - mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); @@ -699,6 +707,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); BUG_ON(!comp_bio); + bio_set_op_attrs(comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; @@ -717,7 +726,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { bio->bi_error = ret; bio_endio(comp_bio); @@ -743,8 +752,11 @@ out: static struct { struct list_head idle_ws; spinlock_t ws_lock; - int num_ws; - atomic_t alloc_ws; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ wait_queue_head_t ws_wait; } btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; @@ -758,16 +770,34 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { + struct list_head *workspace; + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); spin_lock_init(&btrfs_comp_ws[i].ws_lock); - atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + atomic_set(&btrfs_comp_ws[i].total_ws, 0); init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); + + /* + * Preallocate one workspace for each compression type so + * we can guarantee forward progress in the worst case + */ + workspace = btrfs_compress_op[i]->alloc_workspace(); + if (IS_ERR(workspace)) { + printk(KERN_WARNING + "BTRFS: cannot preallocate compression workspace, will try later"); + } else { + atomic_set(&btrfs_comp_ws[i].total_ws, 1); + btrfs_comp_ws[i].free_ws = 1; + list_add(workspace, &btrfs_comp_ws[i].idle_ws); + } } } /* - * this finds an available workspace or allocates a new one - * ERR_PTR is returned if things go bad. + * This finds an available workspace or allocates a new one. + * If it's not possible to allocate a new one, waits until there's one. + * Preallocation makes a forward progress guarantees and we do not return + * errors. */ static struct list_head *find_workspace(int type) { @@ -777,36 +807,58 @@ static struct list_head *find_workspace(int type) struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; again: spin_lock(ws_lock); if (!list_empty(idle_ws)) { workspace = idle_ws->next; list_del(workspace); - (*num_ws)--; + (*free_ws)--; spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_ws) > cpus) { + if (atomic_read(total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(ws_lock); prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_ws) > cpus && !*num_ws) + if (atomic_read(total_ws) > cpus && !*free_ws) schedule(); finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_ws); + atomic_inc(total_ws); spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake_up(ws_wait); + + /* + * Do not return the error but go back to waiting. There's a + * workspace preallocated for each type and the compression + * time is bounded so we get to a workspace eventually. This + * makes our caller's life easier. + * + * To prevent silent and low-probability deadlocks (when the + * initial preallocation fails), check if there are any + * workspaces at all. + */ + if (atomic_read(total_ws) == 0) { + static DEFINE_RATELIMIT_STATE(_rs, + /* once per minute */ 60 * HZ, + /* no burst */ 1); + + if (__ratelimit(&_rs)) { + printk(KERN_WARNING + "no compression workspaces, low memory, retrying"); + } + } + goto again; } return workspace; } @@ -820,21 +872,21 @@ static void free_workspace(int type, struct list_head *workspace) int idx = type - 1; struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; - atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws; wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; - int *num_ws = &btrfs_comp_ws[idx].num_ws; + int *free_ws = &btrfs_comp_ws[idx].free_ws; spin_lock(ws_lock); - if (*num_ws < num_online_cpus()) { + if (*free_ws < num_online_cpus()) { list_add(workspace, idle_ws); - (*num_ws)++; + (*free_ws)++; spin_unlock(ws_lock); goto wake; } spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_ws); + atomic_dec(total_ws); wake: /* * Make sure counter is updated before we wake up waiters. @@ -857,7 +909,7 @@ static void free_workspaces(void) workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&btrfs_comp_ws[i].alloc_ws); + atomic_dec(&btrfs_comp_ws[i].total_ws); } } } @@ -894,8 +946,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, start, len, pages, @@ -930,8 +980,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, disk_start, @@ -952,8 +1000,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, int ret; workspace = find_workspace(type); - if (IS_ERR(workspace)) - return PTR_ERR(workspace); ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, dest_page, start_byte, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec7928a27aaa..d1c56c94dd5a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -156,7 +156,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root) /* * RCU really hurts here, we could free up the root node because - * it was cow'ed but we may not get the new root node yet so do + * it was COWed but we may not get the new root node yet so do * the inc_not_zero dance and if it doesn't work then * synchronize_rcu and try again. */ @@ -955,7 +955,7 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf) { /* - * Tree blocks not in refernece counted trees and tree roots + * Tree blocks not in reference counted trees and tree roots * are never shared. If a block was allocated after the last * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(root->fs_info, buf); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -1270,7 +1270,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info, /* * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewinded (until we reach something older than + * previous operations will be rewound (until we reach something older than * time_seq). */ static void @@ -1345,7 +1345,7 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, } /* - * Called with eb read locked. If the buffer cannot be rewinded, the same buffer + * Called with eb read locked. If the buffer cannot be rewound, the same buffer * is returned. If rewind operations happen, a fresh buffer is returned. The * returned buffer is always read-locked. If the returned buffer is not the * input buffer, the lock on the input buffer is released and the input buffer @@ -1373,7 +1373,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start, + eb->len); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1454,7 +1455,8 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(root->fs_info, logical); + eb = alloc_dummy_extent_buffer(root->fs_info, logical, + root->nodesize); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1503,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; /* ensure we can see the force_cow */ @@ -1516,7 +1518,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, * 3) the root is not forced COW. * * What is forced COW: - * when we create snapshot during commiting the transaction, + * when we create snapshot during committing the transaction, * after we've finished coping src root, we must COW the shared * block to ensure the metadata consistency. */ @@ -1531,7 +1533,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, /* * cows a single block, see __btrfs_cow_block for the real work. - * This version of it has extra checks so that a block isn't cow'd more than + * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, @@ -1552,6 +1554,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, root->fs_info->generation); if (!should_cow_block(trans, root, buf)) { + trans->dirty = true; *cow_ret = buf; return 0; } @@ -1768,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long map_len = 0; int err; + if (low > high) { + btrfs_err(eb->fs_info, + "%s: low (%d) > high (%d) eb %llu owner %llu level %d", + __func__, low, high, eb->start, + btrfs_header_owner(eb), btrfs_header_level(eb)); + return -EINVAL; + } + while (low < high) { mid = (low + high) / 2; offset = p + mid * item_size; @@ -1783,10 +1794,12 @@ static noinline int generic_bin_search(struct extent_buffer *eb, if (!err) { tmp = (struct btrfs_disk_key *)(kaddr + offset - map_start); - } else { + } else if (err == 1) { read_extent_buffer(eb, &unaligned, offset, sizeof(unaligned)); tmp = &unaligned; + } else { + return err; } } else { @@ -1853,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). - * NULL is returned on error. */ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, struct extent_buffer *parent, int slot) @@ -1861,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, int level = btrfs_header_level(parent); struct extent_buffer *eb; - if (slot < 0) - return NULL; - if (slot >= btrfs_header_nritems(parent)) - return NULL; + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { - if (!IS_ERR(eb)) - free_extent_buffer(eb); - eb = NULL; + if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + eb = ERR_PTR(-EIO); } return eb; @@ -1926,9 +1935,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); - if (!child) { - ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } @@ -1965,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; + if (left) { btrfs_tree_lock(left); btrfs_set_lock_blocking(left); @@ -1975,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } } + right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; + if (right) { btrfs_tree_lock(right); btrfs_set_lock_blocking(right); @@ -2031,7 +2047,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -2130,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return 1; left = read_node_slot(root, parent, pslot - 1); + if (IS_ERR(left)) + left = NULL; /* first, try to make some room in the middle buffer */ if (left) { @@ -2180,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, free_extent_buffer(left); } right = read_node_slot(root, parent, pslot + 1); + if (IS_ERR(right)) + right = NULL; /* * then try to empty the right most buffer into the middle @@ -2510,6 +2530,8 @@ read_block_for_search(struct btrfs_trans_handle *trans, if (!btrfs_buffer_uptodate(tmp, 0, 0)) ret = -EIO; free_extent_buffer(tmp); + } else { + ret = PTR_ERR(tmp); } return ret; } @@ -2773,8 +2795,10 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (!should_cow_block(trans, root, b)) + if (!should_cow_block(trans, root, b)) { + trans->dirty = true; goto cow_done; + } /* * must have write locks on this node and the @@ -2823,6 +2847,8 @@ cow_done: } ret = key_search(b, key, level, &prev_cmp, &slot); + if (ret < 0) + goto done; if (level != 0) { int dec = 0; @@ -2986,7 +3012,7 @@ again: btrfs_unlock_up_safe(p, level + 1); /* - * Since we can unwind eb's we want to do a real search every + * Since we can unwind ebs we want to do a real search every * time. */ prev_cmp = -1; @@ -3229,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3304,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(dst, src, @@ -3508,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } copy_extent_buffer(split, c, @@ -3762,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); - if (right == NULL) + /* + * slot + 1 is not valid or we fail to read the right node, + * no big deal, just return. + */ + if (IS_ERR(right)) return 1; btrfs_tree_lock(right); @@ -3992,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); - if (left == NULL) + /* + * slot - 1 is not valid or we fail to read the left node, + * no big deal, just return. + */ + if (IS_ERR(left)) return 1; btrfs_tree_lock(left); @@ -5199,7 +5233,10 @@ find_next_key: } btrfs_set_path_blocking(path); cur = read_node_slot(root, cur, slot); - BUG_ON(!cur); /* -ENOMEM */ + if (IS_ERR(cur)) { + ret = PTR_ERR(cur); + goto out; + } btrfs_tree_read_lock(cur); @@ -5218,15 +5255,21 @@ out: return ret; } -static void tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_root *root, struct btrfs_path *path, int *level, int root_level) { + struct extent_buffer *eb; + BUG_ON(*level == 0); - path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level], - path->slots[*level]); + eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + if (IS_ERR(eb)) + return PTR_ERR(eb); + + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + return 0; } static int tree_move_next_or_upnext(struct btrfs_root *root, @@ -5271,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(root, path, level, root_level); } else { - tree_move_down(root, path, level, root_level); - ret = 0; + ret = tree_move_down(root, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5446,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); - if (ret < 0) + if (ret == -1) left_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -5455,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); - if (ret < 0) + if (ret == -1) right_end_reached = ADVANCE; + else if (ret < 0) + goto out; advance_right = 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84a6a5b3384a..33fe03551105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -33,6 +33,7 @@ #include <asm/kmap_types.h> #include <linux/pagemap.h> #include <linux/btrfs.h> +#include <linux/btrfs_tree.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/sizes.h> @@ -64,98 +65,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* holds pointers to all of the tree roots */ -#define BTRFS_ROOT_TREE_OBJECTID 1ULL - -/* stores information about which extents are in use, and reference counts */ -#define BTRFS_EXTENT_TREE_OBJECTID 2ULL - -/* - * chunk tree stores translations from logical -> physical block numbering - * the super block points to the chunk tree - */ -#define BTRFS_CHUNK_TREE_OBJECTID 3ULL - -/* - * stores information about which areas of a given device are in use. - * one per device. The tree of tree roots points to the device tree - */ -#define BTRFS_DEV_TREE_OBJECTID 4ULL - -/* one per subvolume, storing files and directories */ -#define BTRFS_FS_TREE_OBJECTID 5ULL - -/* directory objectid inside the root tree */ -#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL - -/* holds checksums of all the data extents */ -#define BTRFS_CSUM_TREE_OBJECTID 7ULL - -/* holds quota configuration and tracking */ -#define BTRFS_QUOTA_TREE_OBJECTID 8ULL - -/* for storing items that use the BTRFS_UUID_KEY* types */ -#define BTRFS_UUID_TREE_OBJECTID 9ULL - -/* tracks free space in block groups. */ -#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL - -/* device stats in the device tree */ -#define BTRFS_DEV_STATS_OBJECTID 0ULL - -/* for storing balance parameters in the root tree */ -#define BTRFS_BALANCE_OBJECTID -4ULL - -/* orhpan objectid for tracking unlinked/truncated files */ -#define BTRFS_ORPHAN_OBJECTID -5ULL - -/* does write ahead logging to speed up fsyncs */ -#define BTRFS_TREE_LOG_OBJECTID -6ULL -#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL - -/* for space balancing */ -#define BTRFS_TREE_RELOC_OBJECTID -8ULL -#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL - -/* - * extent checksums all have this objectid - * this allows them to share the logging tree - * for fsyncs - */ -#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL - -/* For storing free space cache */ -#define BTRFS_FREE_SPACE_OBJECTID -11ULL - -/* - * The inode number assigned to the special inode for storing - * free ino cache - */ -#define BTRFS_FREE_INO_OBJECTID -12ULL - -/* dummy objectid represents multiple objectids */ -#define BTRFS_MULTIPLE_OBJECTIDS -255ULL - -/* - * All files have objectids in this range. - */ -#define BTRFS_FIRST_FREE_OBJECTID 256ULL -#define BTRFS_LAST_FREE_OBJECTID -256ULL -#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL - - -/* - * the device items go into the chunk tree. The key is in the form - * [ 1 BTRFS_DEV_ITEM_KEY device_id ] - */ -#define BTRFS_DEV_ITEMS_OBJECTID 1ULL - -#define BTRFS_BTREE_INODE_OBJECTID 1 - -#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 - -#define BTRFS_DEV_REPLACE_DEVID 0ULL - /* * the max metadata block size. This limit is somewhat artificial, * but the memmove costs go through the roof for larger blocks. @@ -175,31 +84,14 @@ struct btrfs_ordered_sum; */ #define BTRFS_LINK_MAX 65535U -/* 32 bytes in various csum fields */ -#define BTRFS_CSUM_SIZE 32 - -/* csum types */ -#define BTRFS_CSUM_TYPE_CRC32 0 - static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ +/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ #define REQ_GET_READ_MIRRORS (1 << 30) -#define BTRFS_FT_UNKNOWN 0 -#define BTRFS_FT_REG_FILE 1 -#define BTRFS_FT_DIR 2 -#define BTRFS_FT_CHRDEV 3 -#define BTRFS_FT_BLKDEV 4 -#define BTRFS_FT_FIFO 5 -#define BTRFS_FT_SOCK 6 -#define BTRFS_FT_SYMLINK 7 -#define BTRFS_FT_XATTR 8 -#define BTRFS_FT_MAX 9 - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 }; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * The key defines the order in the tree, and so it also defines (optimal) - * block layout. - * - * objectid corresponds to the inode number. - * - * type tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with type of 1 might refer to the inode data, - * type of 2 may point to file data in the btree and type == 3 may point to - * extents. - * - * offset is the starting byte offset for this key in the stream. - * - * btrfs_disk_key is in disk byte order. struct btrfs_key is always - * in cpu native order. Otherwise they are identical and their sizes - * should be the same (ie both packed) - */ -struct btrfs_disk_key { - __le64 objectid; - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -struct btrfs_key { - u64 objectid; - u8 type; - u64 offset; -} __attribute__ ((__packed__)); - struct btrfs_mapping_tree { struct extent_map_tree map_tree; }; -struct btrfs_dev_item { - /* the internal btrfs device id */ - __le64 devid; - - /* size of the device */ - __le64 total_bytes; - - /* bytes used */ - __le64 bytes_used; - - /* optimal io alignment for this device */ - __le32 io_align; - - /* optimal io width for this device */ - __le32 io_width; - - /* minimal io size for this device */ - __le32 sector_size; - - /* type and info about this device */ - __le64 type; - - /* expected generation for this device */ - __le64 generation; - - /* - * starting byte of this partition on the device, - * to allow for stripe alignment in the future - */ - __le64 start_offset; - - /* grouping information for allocation decisions */ - __le32 dev_group; - - /* seek speed 0-100 where 100 is fastest */ - u8 seek_speed; - - /* bandwidth 0-100 where 100 is fastest */ - u8 bandwidth; - - /* btrfs generated uuid for this device */ - u8 uuid[BTRFS_UUID_SIZE]; - - /* uuid of FS who owns this device */ - u8 fsid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_stripe { - __le64 devid; - __le64 offset; - u8 dev_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_chunk { - /* size of this chunk in bytes */ - __le64 length; - - /* objectid of the root referencing this chunk */ - __le64 owner; - - __le64 stripe_len; - __le64 type; - - /* optimal io alignment for this chunk */ - __le32 io_align; - - /* optimal io width for this chunk */ - __le32 io_width; - - /* minimal io size for this chunk */ - __le32 sector_size; - - /* 2^16 stripes is quite a lot, a second limit is the size of a single - * item in the btree - */ - __le16 num_stripes; - - /* sub stripes only matter for raid10 */ - __le16 sub_stripes; - struct btrfs_stripe stripe; - /* additional stripes go here */ -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_EXTENT 1 -#define BTRFS_FREE_SPACE_BITMAP 2 - -struct btrfs_free_space_entry { - __le64 offset; - __le64 bytes; - u8 type; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_header { - struct btrfs_disk_key location; - __le64 generation; - __le64 num_entries; - __le64 num_bitmaps; -} __attribute__ ((__packed__)); - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) -#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) - /* * File system states */ @@ -356,13 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) #define BTRFS_FS_STATE_REMOUNTING 1 #define BTRFS_FS_STATE_TRANS_ABORTED 2 #define BTRFS_FS_STATE_DEV_REPLACING 3 - -/* Super block flags */ -/* Errors detected */ -#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2) - -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) -#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) +#define BTRFS_FS_STATE_DUMMY_FS_INFO 4 #define BTRFS_BACKREF_REV_MAX 256 #define BTRFS_BACKREF_REV_SHIFT 56 @@ -390,27 +145,11 @@ struct btrfs_header { u8 level; } __attribute__ ((__packed__)); -#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ - sizeof(struct btrfs_header)) / \ - sizeof(struct btrfs_key_ptr)) -#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) - \ - BTRFS_FILE_EXTENT_INLINE_DATA_START) -#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ - sizeof(struct btrfs_item) -\ - sizeof(struct btrfs_dir_item)) - - /* * this is a very generous portion of the super block, giving us * room to translate 14 chunks with 3 stripes each. */ #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 -#define BTRFS_LABEL_SIZE 256 /* * just in case we somehow lose the roots and are not able to mount, @@ -507,31 +246,6 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) - -#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) -#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) -/* - * some patches floated around with a second compression method - * lets save that incompat here for when they do get in - * Note we don't actually support it, we're just reserving the - * number - */ -#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4) - -/* - * older kernels tried to do bigger metadata blocks, but the - * code was pretty buggy. Lets not let them try anymore. - */ -#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) - -#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) -#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7) -#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) -#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) - #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL @@ -624,357 +338,8 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; - -/* - * items in the extent btree are used to record the objectid of the - * owner of the block and the number of references - */ - -struct btrfs_extent_item { - __le64 refs; - __le64 generation; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_extent_item_v0 { - __le32 refs; -} __attribute__ ((__packed__)); - #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ sizeof(struct btrfs_item)) - -#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) -#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) - -/* following flags only apply to tree blocks */ - -/* use full backrefs for extent pointers in the block */ -#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) - -/* - * this flag is only used internally by scrub and may be changed at any time - * it is only declared here to avoid collisions - */ -#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48) - -struct btrfs_tree_block_info { - struct btrfs_disk_key key; - u8 level; -} __attribute__ ((__packed__)); - -struct btrfs_extent_data_ref { - __le64 root; - __le64 objectid; - __le64 offset; - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_shared_data_ref { - __le32 count; -} __attribute__ ((__packed__)); - -struct btrfs_extent_inline_ref { - u8 type; - __le64 offset; -} __attribute__ ((__packed__)); - -/* old style backrefs item */ -struct btrfs_extent_ref_v0 { - __le64 root; - __le64 generation; - __le64 objectid; - __le32 count; -} __attribute__ ((__packed__)); - - -/* dev extents record free space on individual devices. The owner - * field points back to the chunk allocation mapping tree that allocated - * the extent. The chunk tree uuid field is a way to double check the owner - */ -struct btrfs_dev_extent { - __le64 chunk_tree; - __le64 chunk_objectid; - __le64 chunk_offset; - __le64 length; - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; -} __attribute__ ((__packed__)); - -struct btrfs_inode_ref { - __le64 index; - __le16 name_len; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_inode_extref { - __le64 parent_objectid; - __le64 index; - __le16 name_len; - __u8 name[0]; - /* name goes here */ -} __attribute__ ((__packed__)); - -struct btrfs_timespec { - __le64 sec; - __le32 nsec; -} __attribute__ ((__packed__)); - -struct btrfs_inode_item { - /* nfs style generation number */ - __le64 generation; - /* transid that last touched this inode */ - __le64 transid; - __le64 size; - __le64 nbytes; - __le64 block_group; - __le32 nlink; - __le32 uid; - __le32 gid; - __le32 mode; - __le64 rdev; - __le64 flags; - - /* modification sequence number for NFS */ - __le64 sequence; - - /* - * a little future expansion, for more than this we can - * just grow the inode item and version it - */ - __le64 reserved[4]; - struct btrfs_timespec atime; - struct btrfs_timespec ctime; - struct btrfs_timespec mtime; - struct btrfs_timespec otime; -} __attribute__ ((__packed__)); - -struct btrfs_dir_log_item { - __le64 end; -} __attribute__ ((__packed__)); - -struct btrfs_dir_item { - struct btrfs_disk_key location; - __le64 transid; - __le16 data_len; - __le16 name_len; - u8 type; -} __attribute__ ((__packed__)); - -#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0) - -/* - * Internal in-memory flag that a subvolume has been marked for deletion but - * still visible as a directory - */ -#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48) - -struct btrfs_root_item { - struct btrfs_inode_item inode; - __le64 generation; - __le64 root_dirid; - __le64 bytenr; - __le64 byte_limit; - __le64 bytes_used; - __le64 last_snapshot; - __le64 flags; - __le32 refs; - struct btrfs_disk_key drop_progress; - u8 drop_level; - u8 level; - - /* - * The following fields appear after subvol_uuids+subvol_times - * were introduced. - */ - - /* - * This generation number is used to test if the new fields are valid - * and up to date while reading the root item. Every time the root item - * is written out, the "generation" field is copied into this field. If - * anyone ever mounted the fs with an older kernel, we will have - * mismatching generation values here and thus must invalidate the - * new fields. See btrfs_update_root and btrfs_find_last_root for - * details. - * the offset of generation_v2 is also used as the start for the memset - * when invalidating the fields. - */ - __le64 generation_v2; - u8 uuid[BTRFS_UUID_SIZE]; - u8 parent_uuid[BTRFS_UUID_SIZE]; - u8 received_uuid[BTRFS_UUID_SIZE]; - __le64 ctransid; /* updated when an inode changes */ - __le64 otransid; /* trans when created */ - __le64 stransid; /* trans when sent. non-zero for received subvol */ - __le64 rtransid; /* trans when received. non-zero for received subvol */ - struct btrfs_timespec ctime; - struct btrfs_timespec otime; - struct btrfs_timespec stime; - struct btrfs_timespec rtime; - __le64 reserved[8]; /* for future */ -} __attribute__ ((__packed__)); - -/* - * this is used for both forward and backward root refs - */ -struct btrfs_root_ref { - __le64 dirid; - __le64 sequence; - __le16 name_len; -} __attribute__ ((__packed__)); - -struct btrfs_disk_balance_args { - /* - * profiles to operate on, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 profiles; - - /* - * usage filter - * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' - * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max - */ - union { - __le64 usage; - struct { - __le32 usage_min; - __le32 usage_max; - }; - }; - - /* devid filter */ - __le64 devid; - - /* devid subset filter [pstart..pend) */ - __le64 pstart; - __le64 pend; - - /* btrfs virtual address space subset filter [vstart..vend) */ - __le64 vstart; - __le64 vend; - - /* - * profile to convert to, single is denoted by - * BTRFS_AVAIL_ALLOC_BIT_SINGLE - */ - __le64 target; - - /* BTRFS_BALANCE_ARGS_* */ - __le64 flags; - - /* - * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' - * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum - * and maximum - */ - union { - __le64 limit; - struct { - __le32 limit_min; - __le32 limit_max; - }; - }; - - /* - * Process chunks that cross stripes_min..stripes_max devices, - * BTRFS_BALANCE_ARGS_STRIPES_RANGE - */ - __le32 stripes_min; - __le32 stripes_max; - - __le64 unused[6]; -} __attribute__ ((__packed__)); - -/* - * store balance parameters to disk so that balance can be properly - * resumed after crash or unmount - */ -struct btrfs_balance_item { - /* BTRFS_BALANCE_* */ - __le64 flags; - - struct btrfs_disk_balance_args data; - struct btrfs_disk_balance_args meta; - struct btrfs_disk_balance_args sys; - - __le64 unused[4]; -} __attribute__ ((__packed__)); - -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 - -struct btrfs_file_extent_item { - /* - * transaction id that created this extent - */ - __le64 generation; - /* - * max number of bytes to hold this extent in ram - * when we split a compressed extent we can't know how big - * each of the resulting pieces will be. So, this is - * an upper limit on the size of the extent in ram instead of - * an exact limit. - */ - __le64 ram_bytes; - - /* - * 32 bits for the various ways we might encode the data, - * including compression and encryption. If any of these - * are set to something a given disk format doesn't understand - * it is treated like an incompat flag for reading and writing, - * but not for stat. - */ - u8 compression; - u8 encryption; - __le16 other_encoding; /* spare for later use */ - - /* are we inline data or a real extent? */ - u8 type; - - /* - * disk space consumed by the extent, checksum blocks are included - * in these numbers - * - * At this offset in the structure, the inline extent data start. - */ - __le64 disk_bytenr; - __le64 disk_num_bytes; - /* - * the logical offset in file blocks (no csums) - * this extent record is for. This allows a file extent to point - * into the middle of an existing extent on disk, sharing it - * between two snapshots (useful if some bytes in the middle of the - * extent have changed - */ - __le64 offset; - /* - * the logical number of file blocks (no csums included). This - * always reflects the size uncompressed and without encoding. - */ - __le64 num_bytes; - -} __attribute__ ((__packed__)); - -struct btrfs_csum_item { - u8 csum; -} __attribute__ ((__packed__)); - -struct btrfs_dev_stats_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 values[BTRFS_DEV_STAT_VALUES_MAX]; -} __attribute__ ((__packed__)); - -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 -#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0 -#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1 -#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2 -#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3 -#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4 - struct btrfs_dev_replace { u64 replace_state; /* see #define above */ u64 time_started; /* seconds since 1-Jan-1970 */ @@ -1005,175 +370,6 @@ struct btrfs_dev_replace { struct btrfs_scrub_progress scrub_progress; }; -struct btrfs_dev_replace_item { - /* - * grow this item struct at the end for future enhancements and keep - * the existing values unchanged - */ - __le64 src_devid; - __le64 cursor_left; - __le64 cursor_right; - __le64 cont_reading_from_srcdev_mode; - - __le64 replace_state; - __le64 time_started; - __le64 time_stopped; - __le64 num_write_errors; - __le64 num_uncorrectable_read_errors; -} __attribute__ ((__packed__)); - -/* different types of block groups (and chunks) */ -#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0) -#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1) -#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2) -#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3) -#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) -#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) -#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) -#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) -#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) -#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ - BTRFS_SPACE_INFO_GLOBAL_RSV) - -enum btrfs_raid_types { - BTRFS_RAID_RAID10, - BTRFS_RAID_RAID1, - BTRFS_RAID_DUP, - BTRFS_RAID_RAID0, - BTRFS_RAID_SINGLE, - BTRFS_RAID_RAID5, - BTRFS_RAID_RAID6, - BTRFS_NR_RAID_TYPES -}; - -#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ - BTRFS_BLOCK_GROUP_SYSTEM | \ - BTRFS_BLOCK_GROUP_METADATA) - -#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ - BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6 | \ - BTRFS_BLOCK_GROUP_DUP | \ - BTRFS_BLOCK_GROUP_RAID10) -#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ - BTRFS_BLOCK_GROUP_RAID6) - -/* - * We need a bit for restriper to be able to tell when chunks of type - * SINGLE are available. This "extended" profile format is used in - * fs_info->avail_*_alloc_bits (in-memory) and balance item fields - * (on-disk). The corresponding on-disk bit in chunk.type is reserved - * to avoid remappings between two formats in future. - */ -#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48) - -/* - * A fake block group type that is used to communicate global block reserve - * size to userspace via the SPACE_INFO ioctl. - */ -#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49) - -#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \ - BTRFS_AVAIL_ALLOC_BIT_SINGLE) - -static inline u64 chunk_to_extended(u64 flags) -{ - if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0) - flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE; - - return flags; -} -static inline u64 extended_to_chunk(u64 flags) -{ - return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE; -} - -struct btrfs_block_group_item { - __le64 used; - __le64 chunk_objectid; - __le64 flags; -} __attribute__ ((__packed__)); - -struct btrfs_free_space_info { - __le32 extent_count; - __le32 flags; -} __attribute__ ((__packed__)); - -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) - -#define BTRFS_QGROUP_LEVEL_SHIFT 48 -static inline u64 btrfs_qgroup_level(u64 qgroupid) -{ - return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; -} - -/* - * is subvolume quota turned on? - */ -#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) -/* - * RESCAN is set during the initialization phase - */ -#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) -/* - * Some qgroup entries are known to be out of date, - * either because the configuration has changed in a way that - * makes a rescan necessary, or because the fs has been mounted - * with a non-qgroup-aware version. - * Turning qouta off and on again makes it inconsistent, too. - */ -#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2) - -#define BTRFS_QGROUP_STATUS_VERSION 1 - -struct btrfs_qgroup_status_item { - __le64 version; - /* - * the generation is updated during every commit. As older - * versions of btrfs are not aware of qgroups, it will be - * possible to detect inconsistencies by checking the - * generation on mount time - */ - __le64 generation; - - /* flag definitions see above */ - __le64 flags; - - /* - * only used during scanning to record the progress - * of the scan. It contains a logical address - */ - __le64 rescan; -} __attribute__ ((__packed__)); - -struct btrfs_qgroup_info_item { - __le64 generation; - __le64 rfer; - __le64 rfer_cmpr; - __le64 excl; - __le64 excl_cmpr; -} __attribute__ ((__packed__)); - -/* flags definition for qgroup limits */ -#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0) -#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1) -#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2) -#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3) -#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4) -#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5) - -struct btrfs_qgroup_limit_item { - /* - * only updated when any of the other values change - */ - __le64 flags; - __le64 max_rfer; - __le64 max_excl; - __le64 rsv_rfer; - __le64 rsv_excl; -} __attribute__ ((__packed__)); - /* For raid type sysfs entries */ struct raid_kobject { int raid_type; @@ -1221,7 +417,7 @@ struct btrfs_space_info { * bytes_pinned does not reflect the bytes that will be pinned once the * delayed refs are flushed, so this counter is inc'ed every time we * call btrfs_free_extent so it is a realtime count of what will be - * freed once the transaction is committed. It will be zero'ed every + * freed once the transaction is committed. It will be zeroed every * time the transaction commits. */ struct percpu_counter total_bytes_pinned; @@ -1229,6 +425,9 @@ struct btrfs_space_info { struct list_head list; /* Protected by the spinlock 'lock'. */ struct list_head ro_bgs; + struct list_head priority_tickets; + struct list_head tickets; + u64 tickets_id; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1408,6 +607,27 @@ struct btrfs_block_group_cache { struct btrfs_io_ctl io_ctl; + /* + * Incremented when doing extent allocations and holding a read lock + * on the space_info's groups_sem semaphore. + * Decremented when an ordered extent that represents an IO against this + * block group's range is created (after it's added to its inode's + * root's list of ordered extents) or immediately after the allocation + * if it's a metadata extent or fallocate extent (for these cases we + * don't create ordered extents). + */ + atomic_t reservations; + + /* + * Incremented while holding the spinlock *lock* by a task checking if + * it can perform a nocow write (incremented if the value for the *ro* + * field is 0). Decremented by such tasks once they create an ordered + * extent or before that if some error happens before reaching that step. + * This is to prevent races between block group relocation and nocow + * writes through direct IO. + */ + atomic_t nocow_writers; + /* Lock for free space tree operations. */ struct mutex free_space_lock; @@ -1809,6 +1029,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *qgroup_rescan_workers; struct completion qgroup_rescan_completion; struct btrfs_work qgroup_rescan_work; + bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ /* filesystem state */ unsigned long fs_state; @@ -1860,6 +1081,8 @@ struct btrfs_fs_info { struct list_head pinned_chunks; int creating_free_space_tree; + /* Used to record internally whether fs has been frozen */ + int fs_frozen; }; struct btrfs_subvolume_writers { @@ -1881,12 +1104,11 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_REF_COWS 1 #define BTRFS_ROOT_TRACK_DIRTY 2 #define BTRFS_ROOT_IN_RADIX 3 -#define BTRFS_ROOT_DUMMY_ROOT 4 -#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5 -#define BTRFS_ROOT_DEFRAG_RUNNING 6 -#define BTRFS_ROOT_FORCE_COW 7 -#define BTRFS_ROOT_MULTI_LOG_TASKS 8 -#define BTRFS_ROOT_DIRTY 9 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4 +#define BTRFS_ROOT_DEFRAG_RUNNING 5 +#define BTRFS_ROOT_FORCE_COW 6 +#define BTRFS_ROOT_MULTI_LOG_TASKS 7 +#define BTRFS_ROOT_DIRTY 8 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1948,8 +1170,10 @@ struct btrfs_root { u64 highest_objectid; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; +#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -2026,227 +1250,38 @@ struct btrfs_root { atomic_t qgroup_meta_rsv; }; -struct btrfs_ioctl_defrag_range_args { - /* start of the defrag operation */ - __u64 start; - - /* number of bytes to defrag, use (u64)-1 to say all */ - __u64 len; - - /* - * flags for the operation, which can include turning - * on compression for this one defrag - */ - __u64 flags; - - /* - * any extent bigger than this will be considered - * already defragged. Use 0 to take the kernel default - * Use 1 to say every single extent must be rewritten - */ - __u32 extent_thresh; - - /* - * which compression method to use if turning on compression - * for this defrag operation. If unspecified, zlib will - * be used - */ - __u32 compress_type; - - /* spare for later */ - __u32 unused[4]; -}; - - -/* - * inode items have the data typically returned from stat and store other - * info about object characteristics. There is one for every file and dir in - * the FS - */ -#define BTRFS_INODE_ITEM_KEY 1 -#define BTRFS_INODE_REF_KEY 12 -#define BTRFS_INODE_EXTREF_KEY 13 -#define BTRFS_XATTR_ITEM_KEY 24 -#define BTRFS_ORPHAN_ITEM_KEY 48 -/* reserve 2-15 close to the inode for later flexibility */ - -/* - * dir items are the name -> inode pointers in a directory. There is one - * for every name in a directory. - */ -#define BTRFS_DIR_LOG_ITEM_KEY 60 -#define BTRFS_DIR_LOG_INDEX_KEY 72 -#define BTRFS_DIR_ITEM_KEY 84 -#define BTRFS_DIR_INDEX_KEY 96 -/* - * extent data is for file data - */ -#define BTRFS_EXTENT_DATA_KEY 108 - -/* - * extent csums are stored in a separate tree and hold csums for - * an entire extent on disk. - */ -#define BTRFS_EXTENT_CSUM_KEY 128 - -/* - * root items point to tree roots. They are typically in the root - * tree used by the super block to find all the other trees - */ -#define BTRFS_ROOT_ITEM_KEY 132 - -/* - * root backrefs tie subvols and snapshots to the directory entries that - * reference them - */ -#define BTRFS_ROOT_BACKREF_KEY 144 - -/* - * root refs make a fast index for listing all of the snapshots and - * subvolumes referenced by a given root. They point directly to the - * directory item in the root that references the subvol - */ -#define BTRFS_ROOT_REF_KEY 156 - -/* - * extent items are in the extent map tree. These record which blocks - * are used, and how many references there are to each block - */ -#define BTRFS_EXTENT_ITEM_KEY 168 - -/* - * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know - * the length, so we save the level in key->offset instead of the length. - */ -#define BTRFS_METADATA_ITEM_KEY 169 - -#define BTRFS_TREE_BLOCK_REF_KEY 176 - -#define BTRFS_EXTENT_DATA_REF_KEY 178 - -#define BTRFS_EXTENT_REF_V0_KEY 180 - -#define BTRFS_SHARED_BLOCK_REF_KEY 182 - -#define BTRFS_SHARED_DATA_REF_KEY 184 - -/* - * block groups give us hints into the extent allocation trees. Which - * blocks are free etc etc - */ -#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 - -/* - * Every block group is represented in the free space tree by a free space info - * item, which stores some accounting information. It is keyed on - * (block_group_start, FREE_SPACE_INFO, block_group_length). - */ -#define BTRFS_FREE_SPACE_INFO_KEY 198 - -/* - * A free space extent tracks an extent of space that is free in a block group. - * It is keyed on (start, FREE_SPACE_EXTENT, length). - */ -#define BTRFS_FREE_SPACE_EXTENT_KEY 199 - -/* - * When a block group becomes very fragmented, we convert it to use bitmaps - * instead of extents. A free space bitmap is keyed on - * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with - * (length / sectorsize) bits. - */ -#define BTRFS_FREE_SPACE_BITMAP_KEY 200 - -#define BTRFS_DEV_EXTENT_KEY 204 -#define BTRFS_DEV_ITEM_KEY 216 -#define BTRFS_CHUNK_ITEM_KEY 228 - -/* - * Records the overall state of the qgroups. - * There's only one instance of this key present, - * (0, BTRFS_QGROUP_STATUS_KEY, 0) - */ -#define BTRFS_QGROUP_STATUS_KEY 240 -/* - * Records the currently used space of the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid). - */ -#define BTRFS_QGROUP_INFO_KEY 242 -/* - * Contains the user configured limits for the qgroup. - * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid). - */ -#define BTRFS_QGROUP_LIMIT_KEY 244 -/* - * Records the child-parent relationship of qgroups. For - * each relation, 2 keys are present: - * (childid, BTRFS_QGROUP_RELATION_KEY, parentid) - * (parentid, BTRFS_QGROUP_RELATION_KEY, childid) - */ -#define BTRFS_QGROUP_RELATION_KEY 246 - -/* - * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY. - */ -#define BTRFS_BALANCE_ITEM_KEY 248 - -/* - * The key type for tree items that are stored persistently, but do not need to - * exist for extended period of time. The items can exist in any tree. - * - * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data] - * - * Existing items: - * - * - balance status item - * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0) - */ -#define BTRFS_TEMPORARY_ITEM_KEY 248 +static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) +{ + return blocksize - sizeof(struct btrfs_header); +} -/* - * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY - */ -#define BTRFS_DEV_STATS_KEY 249 +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +{ + return __BTRFS_LEAF_DATA_SIZE(root->nodesize); +} -/* - * The key type for tree items that are stored persistently and usually exist - * for a long period, eg. filesystem lifetime. The item kinds can be status - * information, stats or preference values. The item can exist in any tree. - * - * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data] - * - * Existing items: - * - * - device statistics, store IO stats in the device tree, one key for all - * stats - * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0) - */ -#define BTRFS_PERSISTENT_ITEM_KEY 249 +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); +} -/* - * Persistantly stores the device replace state in the device tree. - * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0). - */ -#define BTRFS_DEV_REPLACE_KEY 250 +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +{ + return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); +} -/* - * Stores items that allow to quickly map UUIDs to something else. - * These items are part of the filesystem UUID tree. - * The key is built like this: - * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits). - */ -#if BTRFS_UUID_SIZE != 16 -#error "UUID items require BTRFS_UUID_SIZE == 16!" -#endif -#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */ -#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to - * received subvols */ +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} -/* - * string items are for debugging. They just store a short string of - * data in the FS - */ -#define BTRFS_STRING_ITEM_KEY 253 +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +{ + return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); +} /* * Flags for mount options. @@ -2288,21 +1323,21 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define btrfs_set_and_info(root, opt, fmt, args...) \ +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ { \ - if (!btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_set_opt(root->fs_info->mount_opt, opt); \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ } -#define btrfs_clear_and_info(root, opt, fmt, args...) \ +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ { \ - if (btrfs_test_opt(root, opt)) \ - btrfs_info(root->fs_info, fmt, ##args); \ - btrfs_clear_opt(root->fs_info->mount_opt, opt); \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ } #ifdef CONFIG_BTRFS_DEBUG @@ -2310,9 +1345,9 @@ static inline int btrfs_should_fragment_free_space(struct btrfs_root *root, struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root, FRAGMENT_METADATA) && + return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(root, FRAGMENT_DATA) && + (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2392,7 +1427,7 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) token->kaddr = NULL; } -/* some macros to generate set/get funcs for the struct fields. This +/* some macros to generate set/get functions for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: */ @@ -3499,11 +2534,17 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start); +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait); + unsigned long count, u64 transid, int wait); int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, @@ -3541,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -3609,6 +2650,15 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELALLOC = 3, + FLUSH_DELALLOC_WAIT = 4, + ALLOC_CHUNK = 5, + COMMIT_TRANS = 6, +}; + int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); @@ -3646,8 +2696,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes); + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, + int update_size); int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); @@ -3860,9 +2910,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ -int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, @@ -4076,7 +3123,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, struct btrfs_root *parent_root, u64 new_dirid); -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); @@ -4122,6 +3169,7 @@ void btrfs_test_inode_set_ops(struct inode *inode); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); @@ -4326,19 +3374,58 @@ static inline void assfail(char *expr, char *file, int line) #define ASSERT(expr) ((void)0) #endif -#define btrfs_assert() __printf(5, 6) __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); const char *btrfs_decode_error(int errno); __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno); +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact line number is reported. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno)); \ +} while (0) + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + + +/* compatibility and incompatibility defines */ + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) @@ -4455,44 +3542,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) return !!(btrfs_super_compat_ro_flags(disk_super) & flag); } -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact line number is reported. - */ -#define btrfs_abort_transaction(trans, root, errno) \ -do { \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((root)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ - } \ - __btrfs_abort_transaction((trans), (root), __func__, \ - __LINE__, (errno)); \ -} while (0) - -#define btrfs_std_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_std_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); @@ -4582,13 +3631,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) void btrfs_test_destroy_inode(struct inode *inode); #endif -static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) return 1; #endif return 0; } - #endif diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h new file mode 100644 index 000000000000..83ebfe28da9e --- /dev/null +++ b/fs/btrfs/dedupe.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2016 Fujitsu. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_DEDUPE__ +#define __BTRFS_DEDUPE__ + +/* later in-band dedupe will expand this struct */ +struct btrfs_dedupe_hash; +#endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6cef0062f929..3eeb9cd8cfa5 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void) delayed_node_cache = kmem_cache_create("btrfs_delayed_node", sizeof(struct btrfs_delayed_node), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!delayed_node_cache) return -ENOMEM; @@ -134,7 +134,7 @@ again: /* cached in the btrfs inode and can be accessed */ atomic_add(2, &node->refs); - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) { kmem_cache_free(delayed_node_cache, node); return ERR_PTR(ret); @@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, dst_rsv = &root->fs_info->delayed_block_rsv; num_bytes = btrfs_calc_trans_metadata_size(root, 1); - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { trace_btrfs_space_reservation(root->fs_info, "delayed_item", item->key.objectid, @@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes = btrfs_calc_trans_metadata_size(root, 1); /* + * If our block_rsv is the delalloc block reserve then check and see if + * we have our extra reservation for updating the inode. If not fall + * through and try to reserve space quickly. + * + * We used to try and steal from the delalloc block rsv or the global + * reserve, but we'd steal a full reservation, which isn't kind. We are + * here through delalloc which means we've likely just cowed down close + * to the leaf that contains the inode, so we would steal less just + * doing the fallback inode update, so if we do end up having to steal + * from the global block rsv we hopefully only steal one or two blocks + * worth which is less likely to hurt us. + */ + if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { + spin_lock(&BTRFS_I(inode)->lock); + if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) + release = true; + else + src_rsv = NULL; + spin_unlock(&BTRFS_I(inode)->lock); + } + + /* * btrfs_dirty_inode will update the inode under btrfs_join_transaction * which doesn't reserve space for speed. This is a problem since we * still need to reserve space for this update, so try to reserve the @@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata( num_bytes, 1); } return ret; - } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) { - spin_lock(&BTRFS_I(inode)->lock); - if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - spin_unlock(&BTRFS_I(inode)->lock); - release = true; - goto migrate; - } - spin_unlock(&BTRFS_I(inode)->lock); - - /* Ok we didn't have space pre-reserved. This shouldn't happen - * too often but it can happen if we do delalloc to an existing - * inode which gets dirtied because of the time update, and then - * isn't touched again until after the transaction commits and - * then we try to write out the data. First try to be nice and - * reserve something strictly for us. If not be a pain and try - * to steal from the delalloc block rsv. - */ - ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, - BTRFS_RESERVE_NO_FLUSH); - if (!ret) - goto out; - - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - goto out; - - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { - btrfs_debug(root->fs_info, - "block rsv migrate returned %d", ret); - WARN_ON(1); - } - /* - * Ok this is a problem, let's just steal from the global rsv - * since this really shouldn't happen that often. - */ - ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, - dst_rsv, num_bytes); - goto out; } -migrate: - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); -out: /* * Migrate only takes a reservation, it doesn't touch the size of the * block_rsv. This is to simplify people who don't normally have things @@ -1188,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, if (ret) { btrfs_release_delayed_node(curr_node); curr_node = NULL; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1606,15 +1588,23 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) return 0; } -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list) +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *item; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) - return; + return false; + + /* + * We can only do one readdir with delayed items at a time because of + * item->readdir_list. + */ + inode_unlock_shared(inode); + inode_lock(inode); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1641,10 +1631,13 @@ void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, * requeue or dequeue this delayed node. */ atomic_dec(&delayed_node->refs); + + return true; } -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list) +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list) { struct btrfs_delayed_item *curr, *next; @@ -1659,6 +1652,12 @@ void btrfs_put_delayed_items(struct list_head *ins_list, if (atomic_dec_and_test(&curr->refs)) kfree(curr); } + + /* + * The VFS is going to do up_read(), so we need to downgrade back to a + * read lock. + */ + downgrade_write(&inode->i_rwsem); } int btrfs_should_delete_dir_index(struct list_head *del_list, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0167853c84ae..2495b3d4075f 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,10 +137,12 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); void btrfs_destroy_delayed_inodes(struct btrfs_root *root); /* Used for readdir() */ -void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, - struct list_head *del_list); -void btrfs_put_delayed_items(struct list_head *ins_list, - struct list_head *del_list); +bool btrfs_readdir_get_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); +void btrfs_readdir_put_delayed_items(struct inode *inode, + struct list_head *ins_list, + struct list_head *del_list); int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 430b3689b112..ac02e041464b 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -606,16 +605,15 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, - qrecord); - if (qexisting) + if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + delayed_refs, qrecord)) kfree(qrecord); } spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); - trace_add_delayed_ref_head(ref, head_ref, action); + trace_add_delayed_ref_head(fs_info, ref, head_ref, action); existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); @@ -682,7 +680,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->type = BTRFS_TREE_BLOCK_REF_KEY; full_ref->level = level; - trace_add_delayed_tree_ref(ref, full_ref, action); + trace_add_delayed_tree_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -739,7 +737,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, full_ref->objectid = owner; full_ref->offset = offset; - trace_add_delayed_data_ref(ref, full_ref, action); + trace_add_delayed_data_ref(fs_info, ref, full_ref, action); ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); @@ -861,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes) -{ - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *ref_head; - int ret = 0; - - if (!fs_info->quota_enabled || !is_fstree(ref_root)) - return 0; - - delayed_refs = &trans->transaction->delayed_refs; - - spin_lock(&delayed_refs->lock); - ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); - if (!ref_head) { - ret = -ENOENT; - goto out; - } - WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); - ref_head->qgroup_ref_root = ref_root; - ref_head->qgroup_reserved = num_bytes; -out: - spin_unlock(&delayed_refs->lock); - return ret; -} - int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -940,28 +911,28 @@ int btrfs_delayed_ref_init(void) btrfs_delayed_ref_head_cachep = kmem_cache_create( "btrfs_delayed_ref_head", sizeof(struct btrfs_delayed_ref_head), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_ref_head_cachep) goto fail; btrfs_delayed_tree_ref_cachep = kmem_cache_create( "btrfs_delayed_tree_ref", sizeof(struct btrfs_delayed_tree_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_tree_ref_cachep) goto fail; btrfs_delayed_data_ref_cachep = kmem_cache_create( "btrfs_delayed_data_ref", sizeof(struct btrfs_delayed_data_ref), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_data_ref_cachep) goto fail; btrfs_delayed_extent_op_cachep = kmem_cache_create( "btrfs_delayed_extent_op", sizeof(struct btrfs_delayed_extent_op), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_delayed_extent_op_cachep) goto fail; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c24b653c7343..43f3629760e9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -188,7 +188,7 @@ struct btrfs_delayed_ref_root { /* * To make qgroup to skip given root. - * This is for snapshot, as btrfs_qgroup_inherit() will manully + * This is for snapshot, as btrfs_qgroup_inherit() will manually * modify counters for snapshot and its source, so we should skip * the snapshot in new_root/old_roots or it will get calculated twice */ @@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 parent, u64 ref_root, u64 owner, u64 offset, u64 reserved, int action, struct btrfs_delayed_extent_op *extent_op); -int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, - struct btrfs_trans_handle *trans, - u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 26bcb487f958..e9bbff3c0029 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device *tgtdev); -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device); static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); static int btrfs_dev_replace_kthread(void *data); static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); @@ -145,7 +142,7 @@ no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -154,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root, DEGRADED)) { + !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, - struct btrfs_ioctl_dev_replace_args *args) +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src) { struct btrfs_trans_handle *trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - switch (args->start.cont_reading_from_srcdev_mode) { - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: - case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: - break; - default: - return -EINVAL; - } - - if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || - args->start.tgtdev_name[0] == '\0') - return -EINVAL; - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, - args->start.srcdev_name, - &src_device); + ret = btrfs_find_device_by_devspec(root, srcdevid, + srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; goto leave; } - dev_replace->cont_reading_from_srcdev_mode = - args->start.cont_reading_from_srcdev_mode; + dev_replace->cont_reading_from_srcdev_mode = read_src; WARN_ON(!src_device); dev_replace->srcdev = src_device; WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - btrfs_info_in_rcu(root->fs_info, + btrfs_info_in_rcu(fs_info, "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), @@ -396,14 +379,13 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->item_needs_writeback = 1; atomic64_set(&dev_replace->num_write_errors, 0); atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace, 1); ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_err(fs_info, "kobj add dev failed %d\n", ret); - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -421,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root, btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); - ret = btrfs_dev_replace_finishing(root->fs_info, ret); - /* don't warn if EINPROGRESS, someone else might be running scrub */ + ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) { - args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - ret = 0; + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; } else { WARN_ON(ret); } @@ -440,8 +420,37 @@ leave: return ret; } +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, + struct btrfs_ioctl_dev_replace_args *args) +{ + int ret; + + switch (args->start.cont_reading_from_srcdev_mode) { + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: + case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: + break; + default: + return -EINVAL; + } + + if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') || + args->start.tgtdev_name[0] == '\0') + return -EINVAL; + + ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + args->start.srcdevid, + args->start.srcdev_name, + args->start.cont_reading_from_srcdev_mode); + args->result = ret; + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS) + ret = 0; + + return ret; +} + /* - * blocked until all flighting bios are finished. + * blocked until all in-flight bios operations are finished. */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { @@ -495,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1); + btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { @@ -560,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ASSERT(list_empty(&src_device->resized_list)); tgt_device->commit_total_bytes = src_device->commit_total_bytes; tgt_device->commit_bytes_used = src_device->bytes_used; - if (fs_info->sb->s_bdev == src_device->bdev) - fs_info->sb->s_bdev = tgt_device->bdev; - if (fs_info->fs_devices->latest_bdev == src_device->bdev) - fs_info->fs_devices->latest_bdev = tgt_device->bdev; + + btrfs_assign_next_active_device(fs_info, src_device, tgt_device); + list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; @@ -626,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } -static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid, - char *srcdev_name, - struct btrfs_device **device) -{ - int ret; - - if (srcdevid) { - ret = 0; - *device = btrfs_find_device(root->fs_info, srcdevid, NULL, - NULL); - if (!*device) - ret = -ENOENT; - } else { - ret = btrfs_find_device_missing_or_by_path(root, srcdev_name, - device); - } - return ret; -} - void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 29e3ef5f96bd..e922b42d91df 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_start(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, struct btrfs_ioctl_dev_replace_args *args); +int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, + u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4e47849d7427..54bc8c7c6bcd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void) btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", sizeof(struct btrfs_end_io_wq), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_end_io_wq_cache) return -ENOMEM; @@ -124,7 +124,6 @@ struct async_submit_bio { struct list_head list; extent_submit_bio_hook_t *submit_bio_start; extent_submit_bio_hook_t *submit_bio_done; - int rw; int mirror_num; unsigned long bio_flags; /* @@ -384,7 +383,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, /* * Things reading via commit roots that don't have normal protection, * like send, can have a really old block in cache that may point at a - * block that has been free'd and re-allocated. So don't clear uptodate + * block that has been freed and re-allocated. So don't clear uptodate * if we find an eb that is under IO (dirty/writeback) because we could * end up reading in the stale data and then writing it back out and * making everybody very sad. @@ -418,7 +417,7 @@ static int btrfs_check_super_csum(char *raw_disk_sb) /* * The super_block structure does not span the whole * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space - * is filled with zeros and is included in the checkum. + * is filled with zeros and is included in the checksum. */ crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE, crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); @@ -560,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root, u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) + if (nritems == 0) { + struct btrfs_root *check_root; + + key.objectid = btrfs_header_owner(leaf); + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + check_root = btrfs_get_fs_root(root->fs_info, &key, false); + /* + * The only reason we also check NULL here is that during + * open_ctree() some roots has not yet been set up. + */ + if (!IS_ERR_OR_NULL(check_root)) { + /* if leaf is the root, then it's fine */ + if (leaf->start != + btrfs_root_bytenr(&check_root->root_item)) { + CORRUPT("non-root leaf's nritems is 0", + leaf, root, 0); + return -EIO; + } + } return 0; + } /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != @@ -600,7 +620,7 @@ static noinline int check_leaf(struct btrfs_root *root, /* * Check to make sure that we don't point outside of the leaf, - * just incase all the items are consistent to eachother, but + * just in case all the items are consistent to each other, but * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > @@ -613,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root, return 0; } +static int check_node(struct btrfs_root *root, struct extent_buffer *node) +{ + unsigned long nr = btrfs_header_nritems(node); + + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + btrfs_crit(root->fs_info, + "corrupt node: block %llu root %llu nritems %lu", + node->start, root->objectid, nr); + return -EIO; + } + return 0; +} + static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror) @@ -683,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; } + if (found_level > 0 && check_node(root, eb)) + ret = -EIO; + if (!ret) set_extent_buffer_uptodate(eb); err: @@ -727,7 +763,7 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->error = bio->bi_error; - if (bio->bi_rw & REQ_WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { wq = fs_info->endio_meta_write_workers; func = btrfs_endio_meta_write_helper; @@ -797,7 +833,7 @@ static void run_one_async_start(struct btrfs_work *work) int ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->rw, async->bio, + ret = async->submit_bio_start(async->inode, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); if (ret) @@ -830,9 +866,8 @@ static void run_one_async_done(struct btrfs_work *work) return; } - async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags, - async->bio_offset); + async->submit_bio_done(async->inode, async->bio, async->mirror_num, + async->bio_flags, async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -844,7 +879,7 @@ static void run_one_async_free(struct btrfs_work *work) } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, @@ -857,7 +892,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, return -ENOMEM; async->inode = inode; - async->rw = rw; async->bio = bio; async->mirror_num = mirror_num; async->submit_bio_start = submit_bio_start; @@ -873,7 +907,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -903,9 +937,8 @@ static int btree_csum_one_bio(struct bio *bio) return ret; } -static int __btree_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btree_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { /* @@ -915,7 +948,7 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, return btree_csum_one_bio(bio); } -static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -925,7 +958,7 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * when we're called for a write, we're already in the async * submission context. Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -944,14 +977,14 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) return 1; } -static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { int async = check_async_write(inode, bio_flags); int ret; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { /* * called for a read, do the setup so that checksum validation * can happen in the async kernel threads @@ -960,21 +993,19 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, - mirror_num, 0); + ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, 0, + inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1098,7 +1129,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) struct inode *btree_inode = root->fs_info->btree_inode; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); @@ -1114,7 +1145,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) + if (IS_ERR(buf)) return 0; set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags); @@ -1146,8 +1177,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr) { - if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr); + if (btrfs_is_testing(root->fs_info)) + return alloc_test_extent_buffer(root->fs_info, bytenr, + root->nodesize); return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1171,8 +1203,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, int ret; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { @@ -1232,6 +1264,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { + bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; @@ -1286,14 +1319,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; - if (fs_info) + if (!dummy) extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - if (fs_info) + if (!dummy) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; @@ -1314,15 +1347,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(void) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize) { struct btrfs_root *root; - root = btrfs_alloc_root(NULL, GFP_KERNEL); + if (!fs_info) + return ERR_PTR(-EINVAL); + + root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, root, NULL, 1); - set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); + /* We don't use the stripesize in selftest, set it as sectorsize */ + __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, + BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1597,14 +1635,14 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) - goto free_writers; + goto fail; mutex_lock(&root->objectid_mutex); ret = btrfs_find_highest_objectid(root, &root->highest_objectid); if (ret) { mutex_unlock(&root->objectid_mutex); - goto free_root_dev; + goto fail; } ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1612,19 +1650,13 @@ int btrfs_init_fs_root(struct btrfs_root *root) mutex_unlock(&root->objectid_mutex); return 0; - -free_root_dev: - free_anon_bdev(root->anon_dev); -free_writers: - btrfs_free_subvolume_writers(root->subv_writers); fail: - kfree(root->free_ino_ctl); - kfree(root->free_ino_pinned); + /* the caller is responsible to call free_fs_root */ return ret; } -static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_id) +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id) { struct btrfs_root *root; @@ -1640,7 +1672,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) return ret; @@ -1803,6 +1835,13 @@ static int cleaner_kthread(void *arg) if (btrfs_need_cleaner_sleep(root)) goto sleep; + /* + * Do not do anything if we might cause open_ctree() to block + * before we have finished mounting the filesystem. + */ + if (!root->fs_info->open) + goto sleep; + if (!mutex_trylock(&root->fs_info->cleaner_mutex)) goto sleep; @@ -2296,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; fs_info->qgroup_ulist = NULL; + fs_info->qgroup_rescan_running = false; mutex_init(&fs_info->qgroup_rescan_lock); } @@ -2306,17 +2346,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); + btrfs_alloc_workqueue(fs_info, "worker", + flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "delalloc", + flags, max_active, 2); fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "flush_delalloc", + flags, max_active, 0); fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0); /* * a higher idle thresh on the submit workers makes it much more @@ -2324,41 +2366,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, * devices */ fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, + btrfs_alloc_workqueue(fs_info, "submit", flags, min_t(u64, fs_devices->num_devices, max_active), 64); fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); /* * endios are largely parallel and should have a very * low idle thresh */ fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-meta", flags, + max_active, 4); fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, + max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, + max_active, 4); fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0); fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "endio-write", flags, + max_active, 2); fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "freespace-write", flags, + max_active, 0); fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + btrfs_alloc_workqueue(fs_info, "delayed-meta", flags, + max_active, 0); fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); + btrfs_alloc_workqueue(fs_info, "readahead", flags, + max_active, 2); fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, + btrfs_alloc_workqueue(fs_info, "extent-refs", flags, min_t(u64, fs_devices->num_devices, max_active), 8); @@ -2417,7 +2466,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_std_error(tree_root->fs_info, ret, + btrfs_handle_fs_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2613,6 +2662,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic_set(&fs_info->reada_works_cnt, 0); atomic64_set(&fs_info->tree_mod_seq, 0); + fs_info->fs_frozen = 0; fs_info->sb = sb; fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; @@ -2713,7 +2763,7 @@ int open_ctree(struct super_block *sb, * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). */ if (btrfs_check_super_csum(bh->b_data)) { - printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); + btrfs_err(fs_info, "superblock checksum mismatch"); err = -EINVAL; brelse(bh); goto fail_alloc; @@ -2733,7 +2783,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); if (ret) { - printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); + btrfs_err(fs_info, "superblock contains fatal errors"); err = -EINVAL; goto fail_alloc; } @@ -2768,9 +2818,9 @@ int open_ctree(struct super_block *sb, features = btrfs_super_incompat_flags(disk_super) & ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { - printk(KERN_ERR "BTRFS: couldn't mount because of " - "unsupported optional features (%Lx).\n", - features); + btrfs_err(fs_info, + "cannot mount because of unsupported optional features (%llx)", + features); err = -EINVAL; goto fail_alloc; } @@ -2781,7 +2831,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_INFO "BTRFS: has skinny extents\n"); + btrfs_info(fs_info, "has skinny extents"); /* * flag our filesystem as having big metadata blocks if @@ -2789,13 +2839,14 @@ int open_ctree(struct super_block *sb, */ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); + btrfs_info(fs_info, + "flagging fs with big metadata feature"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = btrfs_super_stripesize(disk_super); + stripesize = sectorsize; fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); @@ -2805,9 +2856,9 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " - "are not allowed for mixed block groups on %s\n", - sb->s_id); + btrfs_err(fs_info, +"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups", + nodesize, sectorsize); goto fail_alloc; } @@ -2820,8 +2871,8 @@ int open_ctree(struct super_block *sb, features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " - "unsupported option features (%Lx).\n", + btrfs_err(fs_info, + "cannot mount read-write because of unsupported optional features (%llx)", features); err = -EINVAL; goto fail_alloc; @@ -2850,8 +2901,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_ERR "BTRFS: failed to read the system " - "array on %s\n", sb->s_id); + btrfs_err(fs_info, "failed to read the system array: %d", ret); goto fail_sb_buffer; } @@ -2865,8 +2915,7 @@ int open_ctree(struct super_block *sb, generation); if (IS_ERR(chunk_root->node) || !extent_buffer_uptodate(chunk_root->node)) { - printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk root"); if (!IS_ERR(chunk_root->node)) free_extent_buffer(chunk_root->node); chunk_root->node = NULL; @@ -2880,8 +2929,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read chunk tree: %d", ret); goto fail_tree_roots; } @@ -2892,8 +2940,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_ERR "BTRFS: failed to read devices on %s\n", - sb->s_id); + btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -2905,8 +2952,7 @@ retry_root_backup: generation); if (IS_ERR(tree_root->node) || !extent_buffer_uptodate(tree_root->node)) { - printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", - sb->s_id); + btrfs_warn(fs_info, "failed to read tree root"); if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); tree_root->node = NULL; @@ -2938,20 +2984,19 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to recover balance\n"); + btrfs_err(fs_info, "failed to recover balance: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_stats(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", - ret); + btrfs_err(fs_info, "failed to init dev_stats: %d", ret); goto fail_block_groups; } ret = btrfs_init_dev_replace(fs_info); if (ret) { - pr_err("BTRFS: failed to init dev_replace: %d\n", ret); + btrfs_err(fs_info, "failed to init dev_replace: %d", ret); goto fail_block_groups; } @@ -2959,31 +3004,33 @@ retry_root_backup: ret = btrfs_sysfs_add_fsid(fs_devices, NULL); if (ret) { - pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", + ret); goto fail_block_groups; } ret = btrfs_sysfs_add_device(fs_devices); if (ret) { - pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs device interface: %d", + ret); goto fail_fsdev_sysfs; } ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { - pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); + btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); if (ret) { - printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); + btrfs_err(fs_info, "failed to initialize space info: %d", ret); goto fail_sysfs; } ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { - printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); + btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } fs_info->num_tolerated_disk_barrier_failures = @@ -2991,7 +3038,8 @@ retry_root_backup: if (fs_info->fs_devices->missing_devices > fs_info->num_tolerated_disk_barrier_failures && !(sb->s_flags & MS_RDONLY)) { - pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", + btrfs_warn(fs_info, +"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", fs_info->fs_devices->missing_devices, fs_info->num_tolerated_disk_barrier_failures); goto fail_sysfs; @@ -3008,30 +3056,30 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(tree_root, SSD) && - !btrfs_test_opt(tree_root, NOSSD) && + if (!btrfs_test_opt(tree_root->fs_info, SSD) && + !btrfs_test_opt(tree_root->fs_info, NOSSD) && !fs_info->fs_devices->rotating) { - printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " - "mode\n"); + btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); btrfs_set_opt(fs_info->mount_opt, SSD); } /* - * Mount does not set all options immediatelly, we can do it now and do + * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit */ btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { + if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(tree_root, fs_devices, - btrfs_test_opt(tree_root, + btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ? 1 : 0, fs_info->check_integrity_print_mask); if (ret) - printk(KERN_WARNING "BTRFS: failed to initialize" - " integrity check module %s\n", sb->s_id); + btrfs_warn(fs_info, + "failed to initialize integrity check module: %d", + ret); } #endif ret = btrfs_read_qgroup_config(fs_info); @@ -3040,7 +3088,7 @@ retry_root_backup: /* do not make disk changes in broken FS or nologreplay is given */ if (btrfs_super_log_root(disk_super) != 0 && - !btrfs_test_opt(tree_root, NOLOGREPLAY)) { + !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) { ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { err = ret; @@ -3061,8 +3109,8 @@ retry_root_backup: ret = btrfs_recover_relocation(tree_root); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { - printk(KERN_WARNING - "BTRFS: failed to recover relocation\n"); + btrfs_warn(fs_info, "failed to recover relocation: %d", + ret); err = -EINVAL; goto fail_qgroup; } @@ -3081,13 +3129,13 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; - if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: creating free space tree\n"); + btrfs_info(fs_info, "creating free space tree"); ret = btrfs_create_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create free space tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3104,49 +3152,49 @@ retry_root_backup: ret = btrfs_resume_balance_async(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to resume balance\n"); + btrfs_warn(fs_info, "failed to resume balance: %d", ret); close_ctree(tree_root); return ret; } ret = btrfs_resume_dev_replace_async(fs_info); if (ret) { - pr_warn("BTRFS: failed to resume dev_replace\n"); + btrfs_warn(fs_info, "failed to resume device replace: %d", ret); close_ctree(tree_root); return ret; } btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - pr_info("BTRFS: clearing free space tree\n"); + btrfs_info(fs_info, "clearing free space tree"); ret = btrfs_clear_free_space_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to clear free space tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); close_ctree(tree_root); return ret; } } if (!fs_info->uuid_root) { - pr_info("BTRFS: creating UUID tree\n"); + btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to create the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to create the UUID tree: %d", ret); close_ctree(tree_root); return ret; } - } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) || fs_info->generation != btrfs_super_uuid_tree_generation(disk_super)) { - pr_info("BTRFS: checking UUID tree\n"); + btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { - pr_warn("BTRFS: failed to check the UUID tree %d\n", - ret); + btrfs_warn(fs_info, + "failed to check the UUID tree: %d", ret); close_ctree(tree_root); return ret; } @@ -3216,7 +3264,7 @@ fail: return err; recovery_tree_root: - if (!btrfs_test_opt(tree_root, USEBACKUPROOT)) + if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT)) goto fail_tree_roots; free_root_pointers(fs_info, 0); @@ -3245,7 +3293,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) btrfs_warn_rl_in_rcu(device->dev_root->fs_info, "lost page write due to IO error on %s", rcu_str_deref(device->name)); - /* note, we dont' set_buffer_write_io_error because we have + /* note, we don't set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ clear_buffer_uptodate(bh); @@ -3410,9 +3458,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); else - ret = btrfsic_submit_bh(WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); if (ret) errors++; } @@ -3476,12 +3524,13 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; bio_get(bio); - btrfsic_submit_bio(WRITE_FLUSH, bio); + btrfsic_submit_bio(bio); return 0; } @@ -3631,7 +3680,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) int total_errors = 0; u64 flags; - do_barriers = !btrfs_test_opt(root, NOBARRIER); + do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER); backup_super_roots(root->fs_info); sb = root->fs_info->super_for_commit; @@ -3646,7 +3695,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3686,7 +3735,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3704,7 +3753,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_std_error(root->fs_info, -EIO, + btrfs_handle_fs_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3729,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_free_log(NULL, root); + if (root->reloc_root) { + free_extent_buffer(root->reloc_root->node); + free_extent_buffer(root->reloc_root->commit_root); + btrfs_put_fs_root(root->reloc_root); + root->reloc_root = NULL; + } + } if (root->free_ino_pinned) __btrfs_remove_free_space_cache(root->free_ino_pinned); @@ -3841,7 +3897,7 @@ void close_ctree(struct btrfs_root *root) smp_mb(); /* wait for the qgroup rescan worker to stop */ - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); @@ -3915,7 +3971,7 @@ void close_ctree(struct btrfs_root *root) iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY)) + if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY)) btrfsic_unmount(root, fs_info->fs_devices); #endif @@ -4120,6 +4176,16 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ + if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + btrfs_err(fs_info, "bytes_used is too small %llu", + btrfs_super_bytes_used(sb)); + ret = -EINVAL; + } + if (!is_power_of_2(btrfs_super_stripesize(sb))) { + btrfs_err(fs_info, "invalid stripesize %u", + btrfs_super_stripesize(sb)); + ret = -EINVAL; + } if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); @@ -4357,7 +4423,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, if (ret) break; - clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; @@ -4392,7 +4458,7 @@ again: if (ret) break; - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 8e79d0070bcf..f19a982f5a4f 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_id); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info); @@ -90,7 +92,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, void btrfs_free_fs_root(struct btrfs_root *root); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -struct btrfs_root *btrfs_alloc_dummy_root(void); +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, + u32 sectorsize, u32 nodesize); #endif /* @@ -122,7 +125,7 @@ void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, - int rw, struct bio *bio, int mirror_num, + struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84e060eb0de8..665da8f66ff1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,21 +60,6 @@ enum { CHUNK_ALLOC_FORCE = 2, }; -/* - * Control how reservations are dealt with. - * - * RESERVE_FREE - freeing a reservation. - * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for - * ENOSPC accounting - * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update - * bytes_may_use as the ENOSPC accounting is done elsewhere - */ -enum { - RESERVE_FREE = 0, - RESERVE_ALLOC = 1, - RESERVE_ALLOC_NO_ACCOUNT = 2, -}; - static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc); @@ -104,13 +89,24 @@ static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups); -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, - int delalloc); +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc); +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc); static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes, int reserved); +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush); +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -231,9 +227,9 @@ static int add_excluded_extent(struct btrfs_root *root, { u64 end = start + num_bytes - 1; set_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); set_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); return 0; } @@ -246,9 +242,9 @@ static void free_excluded_extents(struct btrfs_root *root, end = start + cache->key.offset - 1; clear_extent_bits(&root->fs_info->freed_extents[0], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); clear_extent_bits(&root->fs_info->freed_extents[1], - start, end, EXTENT_UPTODATE, GFP_NOFS); + start, end, EXTENT_UPTODATE); } static int exclude_super_stripes(struct btrfs_root *root, @@ -980,7 +976,7 @@ out_free: * event that tree block loses its owner tree's reference and do the * back refs conversion. * - * When a tree block is COW'd through a tree, there are four cases: + * When a tree block is COWed through a tree, there are four cases: * * The reference count of the block is one and the tree is the block's * owner tree. Nothing to do in this case. @@ -2042,8 +2038,13 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_DISCARD, + ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, bytenr, &num_bytes, &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { @@ -2074,6 +2075,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2164,7 +2166,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -2188,7 +2190,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(node, ref, node->action); + trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2343,7 +2345,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, SKINNY_METADATA); ref = btrfs_delayed_node_to_tree_ref(node); - trace_run_delayed_tree_ref(node, ref, node->action); + trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) parent = ref->parent; @@ -2407,7 +2409,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, */ BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); - trace_run_delayed_ref_head(node, head, node->action); + trace_run_delayed_ref_head(root->fs_info, node, head, + node->action); if (insert_reserved) { btrfs_pin_extent(root, node->bytenr, @@ -2595,7 +2598,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } /* - * Need to drop our head ref lock and re-aqcuire the + * Need to drop our head ref lock and re-acquire the * delayed ref lock and then re-check to make sure * nobody got added. */ @@ -2747,7 +2750,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) /* * We don't ever fill up leaves all the way so multiply by 2 just to be - * closer to what we're really going to want to ouse. + * closer to what we're really going to want to use. */ return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } @@ -2762,7 +2765,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) u64 num_csums_per_leaf; u64 num_csums; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + csum_size = BTRFS_MAX_ITEM_SIZE(root); num_csums_per_leaf = div64_u64(csum_size, (u64)btrfs_super_csum_size(root->fs_info->super_copy)); num_csums = div64_u64(csum_bytes, root->sectorsize); @@ -2829,6 +2832,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, struct async_delayed_refs { struct btrfs_root *root; + u64 transid; int count; int error; int sync; @@ -2844,6 +2848,10 @@ static void delayed_ref_async_start(struct btrfs_work *work) async = container_of(work, struct async_delayed_refs, work); + /* if the commit is already started, we don't need to wait here */ + if (btrfs_transaction_blocked(async->root->fs_info)) + goto done; + trans = btrfs_join_transaction(async->root); if (IS_ERR(trans)) { async->error = PTR_ERR(trans); @@ -2851,14 +2859,19 @@ static void delayed_ref_async_start(struct btrfs_work *work) } /* - * trans->sync means that when we call end_transaciton, we won't + * trans->sync means that when we call end_transaction, we won't * wait on delayed refs */ trans->sync = true; + + /* Don't bother flushing if we got into a different transaction */ + if (trans->transid > async->transid) + goto end; + ret = btrfs_run_delayed_refs(trans, async->root, async->count); if (ret) async->error = ret; - +end: ret = btrfs_end_transaction(trans, async->root); if (ret && !async->error) async->error = ret; @@ -2870,7 +2883,7 @@ done: } int btrfs_async_run_delayed_refs(struct btrfs_root *root, - unsigned long count, int wait) + unsigned long count, u64 transid, int wait) { struct async_delayed_refs *async; int ret; @@ -2882,6 +2895,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->root = root->fs_info->tree_root; async->count = count; async->error = 0; + async->transid = transid; if (wait) async->sync = 1; else @@ -2943,7 +2957,7 @@ again: trans->can_flush_pending_bgs = false; ret = __btrfs_run_delayed_refs(trans, root, count); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -3207,7 +3221,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, u64, u64, u64, u64, u64, u64); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; ref_root = btrfs_header_owner(buf); @@ -3402,7 +3416,7 @@ again: * transaction, this only happens in really bad situations * anyway. */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } WARN_ON(ret); @@ -3420,7 +3434,7 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE)) { + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3473,7 +3487,6 @@ again: dcs = BTRFS_DC_SETUP; else if (ret == -ENOSPC) set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); - btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3497,7 +3510,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, struct btrfs_path *path; if (list_empty(&cur_trans->dirty_bgs) || - !btrfs_test_opt(root, SPACE_CACHE)) + !btrfs_test_opt(root->fs_info, SPACE_CACHE)) return 0; path = btrfs_alloc_path(); @@ -3642,7 +3655,7 @@ again: } spin_unlock(&cur_trans->dirty_bgs_lock); } else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } } @@ -3788,7 +3801,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache); } if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } /* if its not on the io list, we need to put the block group */ @@ -3824,6 +3837,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + bool ret = true; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg) + return false; + + spin_lock(&bg->lock); + if (bg->ro) + ret = false; + else + atomic_inc(&bg->nocow_writers); + spin_unlock(&bg->lock); + + /* no put on block group, done by btrfs_dec_nocow_writers */ + if (!ret) + btrfs_put_block_group(bg); + + return ret; + +} + +void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, bytenr); + ASSERT(bg); + if (atomic_dec_and_test(&bg->nocow_writers)) + wake_up_atomic_t(&bg->nocow_writers); + /* + * Once for our lookup and once for the lookup done by a previous call + * to btrfs_inc_nocow_writers() + */ + btrfs_put_block_group(bg); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +{ + wait_on_atomic_t(&bg->nocow_writers, + btrfs_wait_nocow_writers_atomic_t, + TASK_UNINTERRUPTIBLE); +} + static const char *alloc_name(u64 flags) { switch (flags) { @@ -3843,6 +3909,7 @@ static const char *alloc_name(u64 flags) static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, + u64 bytes_readonly, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; @@ -3863,8 +3930,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; + found->bytes_readonly += bytes_readonly; if (total_bytes > 0) found->full = 0; + space_info_add_new_bytes(info, found, total_bytes - + bytes_used - bytes_readonly); spin_unlock(&found->lock); *space_info = found; return 0; @@ -3890,7 +3960,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; - found->bytes_readonly = 0; + found->bytes_readonly = bytes_readonly; found->bytes_may_use = 0; found->full = 0; found->max_extent_size = 0; @@ -3899,6 +3969,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->flush = 0; init_waitqueue_head(&found->wait); INIT_LIST_HEAD(&found->ro_bgs); + INIT_LIST_HEAD(&found->tickets); + INIT_LIST_HEAD(&found->priority_tickets); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -4141,7 +4213,7 @@ commit_trans: if (need_commit > 0) { btrfs_start_delalloc_roots(fs_info, 0, -1); - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } trans = btrfs_join_transaction(root); @@ -4199,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) if (ret < 0) return ret; - /* - * Use new btrfs_qgroup_reserve_data to reserve precious data space - * - * TODO: Find a good method to avoid reserve data space for NOCOW - * range, but don't impact performance on quota disable case. - */ + /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, start, len); + if (ret) + btrfs_free_reserved_data_space_noquota(inode, start, len); return ret; } @@ -4243,7 +4312,7 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, * Called if we need to clear a data reservation for this inode * Normally in a error case. * - * This one will handle the per-indoe data rsv map for accurate reserved + * This one will handle the per-inode data rsv map for accurate reserved * space framework. */ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) @@ -4357,7 +4426,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + btrfs_calc_trans_metadata_size(root, 1); - if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); dump_space_info(info, 0, 0); @@ -4385,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans, } } +/* + * If force is CHUNK_ALLOC_FORCE: + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + * If force is NOT CHUNK_ALLOC_FORCE: + * - return 0 if it doesn't need to allocate a new chunk, + * - return 1 if it successfully allocates a chunk, + * - return errors including -ENOSPC otherwise. + */ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force) { @@ -4400,7 +4478,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, space_info = __find_space_info(extent_root->fs_info, flags); if (!space_info) { ret = update_space_info(extent_root->fs_info, flags, - 0, 0, &space_info); + 0, 0, 0, &space_info); BUG_ON(ret); /* -ENOMEM */ } BUG_ON(!space_info); /* Logic error */ @@ -4502,7 +4580,7 @@ out: */ if (trans->can_flush_pending_bgs && trans->chunk_bytes_reserved >= (u64)SZ_2M) { - btrfs_create_pending_block_groups(trans, trans->root); + btrfs_create_pending_block_groups(trans, extent_root); btrfs_trans_release_chunk_metadata(trans); } return ret; @@ -4512,12 +4590,19 @@ static int can_overcommit(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - u64 profile = btrfs_get_alloc_profile(root, 0); + struct btrfs_block_rsv *global_rsv; + u64 profile; u64 space_size; u64 avail; u64 used; + /* Don't overcommit when in mixed mode. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) + return 0; + + BUG_ON(root->fs_info == NULL); + global_rsv = &root->fs_info->global_block_rsv; + profile = btrfs_get_alloc_profile(root, 0); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly; @@ -4583,7 +4668,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, */ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); if (!current->journal_info) - btrfs_wait_ordered_roots(root->fs_info, nr_items); + btrfs_wait_ordered_roots(root->fs_info, nr_items, + 0, (u64)-1); } } @@ -4620,7 +4706,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, /* Calc the number of the pages we need flush for space reservation */ items = calc_reclaim_items_nr(root, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM; trans = (struct btrfs_trans_handle *)current->journal_info; block_rsv = &root->fs_info->delalloc_block_rsv; @@ -4632,7 +4718,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, if (trans) return; if (wait_ordered) - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); return; } @@ -4667,11 +4754,17 @@ skip_async: spin_unlock(&space_info->lock); break; } + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } spin_unlock(&space_info->lock); loops++; if (wait_ordered && !trans) { - btrfs_wait_ordered_roots(root->fs_info, items); + btrfs_wait_ordered_roots(root->fs_info, items, + 0, (u64)-1); } else { time_left = schedule_timeout_killable(1); if (time_left) @@ -4734,13 +4827,11 @@ commit: return btrfs_commit_transaction(trans, root); } -enum flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELALLOC = 3, - FLUSH_DELALLOC_WAIT = 4, - ALLOC_CHUNK = 5, - COMMIT_TRANS = 6, +struct reserve_ticket { + u64 bytes; + int error; + struct list_head list; + wait_queue_head_t wait; }; static int flush_space(struct btrfs_root *root, @@ -4782,7 +4873,7 @@ static int flush_space(struct btrfs_root *root, btrfs_get_alloc_profile(root, 0), CHUNK_ALLOC_NO_FORCE); btrfs_end_transaction(trans, root); - if (ret == -ENOSPC) + if (ret > 0 || ret == -ENOSPC) ret = 0; break; case COMMIT_TRANS: @@ -4793,6 +4884,8 @@ static int flush_space(struct btrfs_root *root, break; } + trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes, + orig_bytes, state, ret); return ret; } @@ -4800,17 +4893,22 @@ static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, struct btrfs_space_info *space_info) { + struct reserve_ticket *ticket; u64 used; u64 expected; - u64 to_reclaim; + u64 to_reclaim = 0; + + list_for_each_entry(ticket, &space_info->tickets, list) + to_reclaim += ticket->bytes; + list_for_each_entry(ticket, &space_info->priority_tickets, list) + to_reclaim += ticket->bytes; + if (to_reclaim) + return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL)) { - to_reclaim = 0; - goto out; - } + BTRFS_RESERVE_FLUSH_ALL)) + return 0; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + @@ -4826,14 +4924,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, to_reclaim = 0; to_reclaim = min(to_reclaim, space_info->bytes_may_use + space_info->bytes_reserved); -out: - spin_unlock(&space_info->lock); - return to_reclaim; } static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, u64 used) + struct btrfs_root *root, u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -4841,158 +4936,216 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - return (used >= thresh && !btrfs_fs_closing(fs_info) && - !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); + if (!btrfs_calc_reclaim_metadata_size(root, space_info)) + return 0; + + return (used >= thresh && !btrfs_fs_closing(root->fs_info) && + !test_bit(BTRFS_FS_STATE_REMOUNTING, + &root->fs_info->fs_state)); } -static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info, - int flush_state) +static void wake_all_tickets(struct list_head *head) { - u64 used; + struct reserve_ticket *ticket; - spin_lock(&space_info->lock); - /* - * We run out of space and have not got any free space via flush_space, - * so don't bother doing async reclaim. - */ - if (flush_state > COMMIT_TRANS && space_info->full) { - spin_unlock(&space_info->lock); - return 0; + while (!list_empty(head)) { + ticket = list_first_entry(head, struct reserve_ticket, list); + list_del_init(&ticket->list); + ticket->error = -ENOSPC; + wake_up(&ticket->wait); } - - used = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly + - space_info->bytes_may_use; - if (need_do_async_reclaim(space_info, fs_info, used)) { - spin_unlock(&space_info->lock); - return 1; - } - spin_unlock(&space_info->lock); - - return 0; } +/* + * This is for normal flushers, we can wait all goddamned day if we want to. We + * will loop and continuously try to flush as long as we are making progress. + * We count progress as clearing off tickets each time we have to loop. + */ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) { struct btrfs_fs_info *fs_info; struct btrfs_space_info *space_info; u64 to_reclaim; int flush_state; + int commit_cycles = 0; + u64 last_tickets_id; fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&space_info->lock); to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, space_info); - if (!to_reclaim) + if (!to_reclaim) { + space_info->flush = 0; + spin_unlock(&space_info->lock); return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); flush_state = FLUSH_DELAYED_ITEMS_NR; do { + struct reserve_ticket *ticket; + int ret; + + ret = flush_space(fs_info->fs_root, space_info, to_reclaim, + to_reclaim, flush_state); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + ticket = list_first_entry(&space_info->tickets, + struct reserve_ticket, list); + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = FLUSH_DELAYED_ITEMS_NR; + if (commit_cycles) + commit_cycles--; + } + + if (flush_state > COMMIT_TRANS) { + commit_cycles++; + if (commit_cycles > 2) { + wake_all_tickets(&space_info->tickets); + space_info->flush = 0; + } else { + flush_state = FLUSH_DELAYED_ITEMS_NR; + } + } + spin_unlock(&space_info->lock); + } while (flush_state <= COMMIT_TRANS); +} + +void btrfs_init_async_reclaim_work(struct work_struct *work) +{ + INIT_WORK(work, btrfs_async_reclaim_metadata_space); +} + +static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + u64 to_reclaim; + int flush_state = FLUSH_DELAYED_ITEMS_NR; + + spin_lock(&space_info->lock); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root, + space_info); + if (!to_reclaim) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + + do { flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info, - flush_state)) + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); return; + } + spin_unlock(&space_info->lock); + + /* + * Priority flushers can't wait on delalloc without + * deadlocking. + */ + if (flush_state == FLUSH_DELALLOC || + flush_state == FLUSH_DELALLOC_WAIT) + flush_state = ALLOC_CHUNK; } while (flush_state < COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static int wait_reserve_ticket(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, u64 orig_bytes) + { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + DEFINE_WAIT(wait); + int ret = 0; + + spin_lock(&space_info->lock); + while (ticket->bytes > 0 && ticket->error == 0) { + ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE); + if (ret) { + ret = -EINTR; + break; + } + spin_unlock(&space_info->lock); + + schedule(); + + finish_wait(&ticket->wait, &wait); + spin_lock(&space_info->lock); + } + if (!ret) + ret = ticket->error; + if (!list_empty(&ticket->list)) + list_del_init(&ticket->list); + if (ticket->bytes && ticket->bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket->bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + } + spin_unlock(&space_info->lock); + + return ret; } /** * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space * @root - the root we're allocating for - * @block_rsv - the block_rsv we're allocating for + * @space_info - the space info we want to allocate from * @orig_bytes - the number of bytes we want * @flush - whether or not we can flush to make our reservation * - * This will reserve orgi_bytes number of bytes from the space info associated + * This will reserve orig_bytes number of bytes from the space info associated * with the block_rsv. If there is not enough space it will make an attempt to * flush out space to make room. It will do this by flushing delalloc if * possible or committing the transaction. If flush is 0 then no attempts to * regain reservations will be made and this will fail if there is not enough * space already. */ -static int reserve_metadata_bytes(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_space_info *space_info, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *space_info = block_rsv->space_info; + struct reserve_ticket ticket; u64 used; - u64 num_bytes = orig_bytes; - int flush_state = FLUSH_DELAYED_ITEMS_NR; int ret = 0; - bool flushing = false; - -again: - ret = 0; - spin_lock(&space_info->lock); - /* - * We only want to wait if somebody other than us is flushing and we - * are actually allowed to flush all things. - */ - while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing && - space_info->flush) { - spin_unlock(&space_info->lock); - /* - * If we have a trans handle we can't wait because the flusher - * may have to commit the transaction, which would mean we would - * deadlock since we are waiting for the flusher to finish, but - * hold the current transaction open. - */ - if (current->journal_info) - return -EAGAIN; - ret = wait_event_killable(space_info->wait, !space_info->flush); - /* Must have been killed, return */ - if (ret) - return -EINTR; - spin_lock(&space_info->lock); - } + ASSERT(orig_bytes); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + spin_lock(&space_info->lock); ret = -ENOSPC; used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; /* - * The idea here is that we've not already over-reserved the block group - * then we can go ahead and save our reservation first and then start - * flushing if we need to. Otherwise if we've already overcommitted - * lets start flushing stuff first and then come back and try to make - * our reservation. + * If we have enough space then hooray, make our reservation and carry + * on. If not see if we can overcommit, and if we can, hooray carry on. + * If not things get more complicated. */ - if (used <= space_info->total_bytes) { - if (used + orig_bytes <= space_info->total_bytes) { - space_info->bytes_may_use += orig_bytes; - trace_btrfs_space_reservation(root->fs_info, - "space_info", space_info->flags, orig_bytes, 1); - ret = 0; - } else { - /* - * Ok set num_bytes to orig_bytes since we aren't - * overocmmitted, this way we only try and reclaim what - * we need. - */ - num_bytes = orig_bytes; - } - } else { - /* - * Ok we're over committed, set num_bytes to the overcommitted - * amount plus the amount of bytes that we need for this - * reservation. - */ - num_bytes = used - space_info->total_bytes + - (orig_bytes * 2); - } - - if (ret && can_overcommit(root, space_info, orig_bytes, flush)) { + if (used + orig_bytes <= space_info->total_bytes) { + space_info->bytes_may_use += orig_bytes; + trace_btrfs_space_reservation(root->fs_info, "space_info", + space_info->flags, orig_bytes, + 1); + ret = 0; + } else if (can_overcommit(root, space_info, orig_bytes, flush)) { space_info->bytes_may_use += orig_bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", space_info->flags, orig_bytes, @@ -5001,16 +5154,31 @@ again: } /* - * Couldn't make our reservation, save our place so while we're trying - * to reclaim space we can actually use it instead of somebody else - * stealing it from us. + * If we couldn't make a reservation then setup our reservation ticket + * and kick the async worker if it's not already running. * - * We make the other tasks wait for the flush only when we can flush - * all things. + * If we are a priority flusher then we just need to add our ticket to + * the list and we will do our own flushing further down. */ if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { - flushing = true; - space_info->flush = 1; + ticket.bytes = orig_bytes; + ticket.error = 0; + init_waitqueue_head(&ticket.wait); + if (flush == BTRFS_RESERVE_FLUSH_ALL) { + list_add_tail(&ticket.list, &space_info->tickets); + if (!space_info->flush) { + space_info->flush = 1; + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "enospc"); + queue_work(system_unbound_wq, + &root->fs_info->async_reclaim_work); + } + } else { + list_add_tail(&ticket.list, + &space_info->priority_tickets); + } } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; /* @@ -5019,39 +5187,67 @@ again: * the async reclaim as we will panic. */ if (!root->fs_info->log_root_recovering && - need_do_async_reclaim(space_info, root->fs_info, used) && - !work_busy(&root->fs_info->async_reclaim_work)) + need_do_async_reclaim(space_info, root, used) && + !work_busy(&root->fs_info->async_reclaim_work)) { + trace_btrfs_trigger_flush(root->fs_info, + space_info->flags, + orig_bytes, flush, + "preempt"); queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); + } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) - goto out; + return ret; - ret = flush_space(root, space_info, num_bytes, orig_bytes, - flush_state); - flush_state++; + if (flush == BTRFS_RESERVE_FLUSH_ALL) + return wait_reserve_ticket(root->fs_info, space_info, &ticket, + orig_bytes); - /* - * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock - * would happen. So skip delalloc flush. - */ - if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - (flush_state == FLUSH_DELALLOC || - flush_state == FLUSH_DELALLOC_WAIT)) - flush_state = ALLOC_CHUNK; + ret = 0; + priority_reclaim_metadata_space(root->fs_info, space_info, &ticket); + spin_lock(&space_info->lock); + if (ticket.bytes) { + if (ticket.bytes < orig_bytes) { + u64 num_bytes = orig_bytes - ticket.bytes; + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(root->fs_info, + "space_info", space_info->flags, + num_bytes, 0); - if (!ret) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_LIMIT && - flush_state < COMMIT_TRANS) - goto again; - else if (flush == BTRFS_RESERVE_FLUSH_ALL && - flush_state <= COMMIT_TRANS) - goto again; + } + list_del_init(&ticket.list); + ret = -ENOSPC; + } + spin_unlock(&space_info->lock); + ASSERT(list_empty(&ticket.list)); + return ret; +} -out: +/** + * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space + * @root - the root we're allocating for + * @block_rsv - the block_rsv we're allocating for + * @orig_bytes - the number of bytes we want + * @flush - whether or not we can flush to make our reservation + * + * This will reserve orgi_bytes number of bytes from the space info associated + * with the block_rsv. If there is not enough space it will make an attempt to + * flush out space to make room. It will do this by flushing delalloc if + * possible or committing the transaction. If flush is 0 then no attempts to + * regain reservations will be made and this will fail if there is not enough + * space already. + */ +static int reserve_metadata_bytes(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) +{ + int ret; + + ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes, + flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { struct btrfs_block_rsv *global_rsv = @@ -5064,13 +5260,8 @@ out: if (ret == -ENOSPC) trace_btrfs_space_reservation(root->fs_info, "space_info:enospc", - space_info->flags, orig_bytes, 1); - if (flushing) { - spin_lock(&space_info->lock); - space_info->flush = 0; - wake_up_all(&space_info->wait); - spin_unlock(&space_info->lock); - } + block_rsv->space_info->flags, + orig_bytes, 1); return ret; } @@ -5146,6 +5337,110 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, return 0; } +/* + * This is for space we already have accounted in space_info->bytes_may_use, so + * basically when we're returning space from block_rsv's. + */ +static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head; + u64 used; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH; + bool check_overcommit = false; + + spin_lock(&space_info->lock); + head = &space_info->priority_tickets; + + /* + * If we are over our limit then we need to check and see if we can + * overcommit, and if we can't then we just need to free up our space + * and not satisfy any requests. + */ + used = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; + if (used - num_bytes >= space_info->total_bytes) + check_overcommit = true; +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + /* + * We use 0 bytes because this space is already reserved, so + * adding the ticket space would be a double count. + */ + if (check_overcommit && + !can_overcommit(fs_info->extent_root, space_info, 0, + flush)) + break; + if (num_bytes >= ticket->bytes) { + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + flush = BTRFS_RESERVE_FLUSH_ALL; + goto again; + } + space_info->bytes_may_use -= num_bytes; + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, num_bytes, 0); + spin_unlock(&space_info->lock); +} + +/* + * This is for newly allocated space that isn't accounted in + * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent + * we use this helper. + */ +static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 num_bytes) +{ + struct reserve_ticket *ticket; + struct list_head *head = &space_info->priority_tickets; + +again: + while (!list_empty(head) && num_bytes) { + ticket = list_first_entry(head, struct reserve_ticket, + list); + if (num_bytes >= ticket->bytes) { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + ticket->bytes, 1); + list_del_init(&ticket->list); + num_bytes -= ticket->bytes; + space_info->bytes_may_use += ticket->bytes; + ticket->bytes = 0; + space_info->tickets_id++; + wake_up(&ticket->wait); + } else { + trace_btrfs_space_reservation(fs_info, "space_info", + space_info->flags, + num_bytes, 1); + space_info->bytes_may_use += num_bytes; + ticket->bytes -= num_bytes; + num_bytes = 0; + } + } + + if (num_bytes && head == &space_info->priority_tickets) { + head = &space_info->tickets; + goto again; + } +} + static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes) @@ -5180,18 +5475,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info, } spin_unlock(&dest->lock); } - if (num_bytes) { - spin_lock(&space_info->lock); - space_info->bytes_may_use -= num_bytes; - trace_btrfs_space_reservation(fs_info, "space_info", - space_info->flags, num_bytes, 0); - spin_unlock(&space_info->lock); - } + if (num_bytes) + space_info_add_old_bytes(fs_info, space_info, + num_bytes); } } -static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, - struct btrfs_block_rsv *dst, u64 num_bytes) +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes, + int update_size) { int ret; @@ -5199,7 +5491,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, if (ret) return ret; - block_rsv_add_bytes(dst, num_bytes, 1); + block_rsv_add_bytes(dst, num_bytes, update_size); return 0; } @@ -5306,13 +5598,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, - struct btrfs_block_rsv *dst_rsv, - u64 num_bytes) -{ - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); -} - void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) @@ -5325,48 +5610,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root, num_bytes); } -/* - * helper to calculate size of global block reservation. - * the desired value is sum of space used by extent tree, - * checksum tree and root tree - */ -static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) -{ - struct btrfs_space_info *sinfo; - u64 num_bytes; - u64 meta_used; - u64 data_used; - int csum_size = btrfs_super_csum_size(fs_info->super_copy); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); - spin_lock(&sinfo->lock); - data_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - spin_lock(&sinfo->lock); - if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) - data_used = 0; - meta_used = sinfo->bytes_used; - spin_unlock(&sinfo->lock); - - num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * - csum_size * 2; - num_bytes += div_u64(data_used + meta_used, 50); - - if (num_bytes * 3 > meta_used) - num_bytes = div_u64(meta_used, 3); - - return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); -} - static void update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; struct btrfs_space_info *sinfo = block_rsv->space_info; u64 num_bytes; - num_bytes = calc_global_metadata_size(fs_info); + /* + * The global block rsv is based on the size of the extent tree, the + * checksum tree and the root tree. If the fs is empty we want to set + * it to a minimal amount for safety. + */ + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) + + btrfs_root_used(&fs_info->csum_root->root_item) + + btrfs_root_used(&fs_info->tree_root->root_item); + num_bytes = max_t(u64, num_bytes, SZ_16M); spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); @@ -5464,7 +5722,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { - struct btrfs_fs_info *fs_info = trans->root->fs_info; + struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; @@ -5481,7 +5739,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + /* + * We always use trans->block_rsv here as we will have reserved space + * for our orphan when starting the transaction, using get_block_rsv() + * here will sometimes make us choose the wrong block rsv as we could be + * doing a reloc inode for a non refcounted root. + */ + struct btrfs_block_rsv *src_rsv = trans->block_rsv; struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* @@ -5492,7 +5756,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); trace_btrfs_space_reservation(root->fs_info, "orphan", btrfs_ino(inode), num_bytes, 1); - return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); + return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); } void btrfs_orphan_release_metadata(struct inode *inode) @@ -5516,7 +5780,7 @@ void btrfs_orphan_release_metadata(struct inode *inode) * common file/directory operations, they change two fs/file trees * and root tree, the number of items that the qgroup reserves is * different with the free space reservation. So we can not use - * the space reseravtion mechanism in start_transaction(). + * the space reservation mechanism in start_transaction(). */ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, @@ -5547,7 +5811,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, BTRFS_RESERVE_FLUSH_ALL); if (ret == -ENOSPC && use_global_rsv) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); + ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1); if (ret && *qgroup_reserved) btrfs_qgroup_free_meta(root, *qgroup_reserved); @@ -5565,7 +5829,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for - * @num_bytes: the number of bytes we're relaseing. + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of @@ -5591,7 +5855,7 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) drop_inode_space = 1; /* - * If we have more or the same amount of outsanding extents than we have + * If we have more or the same amount of outstanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= @@ -5605,8 +5869,8 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) } /** - * calc_csum_metadata_size - return the amount of metada space that must be - * reserved/free'd for the given bytes. + * calc_csum_metadata_size - return the amount of metadata space that must be + * reserved/freed for the given bytes. * @inode: the inode we're manipulating * @num_bytes: the number of bytes in question * @reserve: 1 if we are reserving space, 0 if we are freeing space @@ -5657,21 +5921,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) u64 to_reserve = 0; u64 csum_bytes; unsigned nr_extents = 0; - int extra_reserve = 0; enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; int ret = 0; bool delalloc_lock = true; u64 to_free = 0; unsigned dropped; + bool release_extra = false; /* If we are a free space inode we need to not flush since we will be in * the middle of a transaction commit. We also don't need the delalloc * mutex since we won't race with anybody. We need this mostly to make * lockdep shut its filthy mouth. + * + * If we have a transaction open (can happen if we call truncate_block + * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ if (btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; delalloc_lock = false; + } else if (current->journal_info) { + flush = BTRFS_RESERVE_FLUSH_LIMIT; } if (flush != BTRFS_RESERVE_NO_FLUSH && @@ -5688,24 +5957,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); BTRFS_I(inode)->outstanding_extents += nr_extents; - nr_extents = 0; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) - nr_extents = BTRFS_I(inode)->outstanding_extents - + nr_extents += BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; - /* - * Add an item to reserve for updating the inode when we complete the - * delalloc io. - */ - if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags)) { - nr_extents++; - extra_reserve = 1; - } - - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* We always want to reserve a slot for updating the inode. */ + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); @@ -5717,17 +5977,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) goto out_fail; } - ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); + ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); - if (extra_reserve) { - set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, - &BTRFS_I(inode)->runtime_flags); - nr_extents--; + if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, + &BTRFS_I(inode)->runtime_flags)) { + to_reserve -= btrfs_calc_trans_metadata_size(root, 1); + release_extra = true; } BTRFS_I(inode)->reserved_extents += nr_extents; spin_unlock(&BTRFS_I(inode)->lock); @@ -5738,8 +5998,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (to_reserve) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); - block_rsv_add_bytes(block_rsv, to_reserve, 1); - + if (release_extra) + btrfs_block_rsv_release(root, block_rsv, + btrfs_calc_trans_metadata_size(root, + 1)); return 0; out_fail: @@ -5758,7 +6020,7 @@ out_fail: /* * This is tricky, but first we need to figure out how much we - * free'd from any free-ers that occurred during this + * freed from any free-ers that occurred during this * reservation, so we reset ->csum_bytes to the csum_bytes * before we dropped our lock, and then call the free for the * number of bytes that were freed while we were trying our @@ -5780,7 +6042,7 @@ out_fail: /* * Now reset ->csum_bytes to what it should be. If bytes is - * more than to_free then we would have free'd more space had we + * more than to_free then we would have freed more space had we * not had an artificially high ->csum_bytes, so we need to free * the remainder. If bytes is the same or less then we don't * need to do anything, the other free-ers did the correct @@ -5831,7 +6093,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; trace_btrfs_space_reservation(root->fs_info, "delalloc", @@ -5946,7 +6208,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -5971,6 +6233,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, + num_bytes, 1); set_extent_dirty(info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); @@ -6045,10 +6310,10 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); + trace_btrfs_space_reservation(root->fs_info, "pinned", + cache->space_info->flags, num_bytes, 1); set_extent_dirty(root->fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); - if (reserved) - trace_btrfs_reserved_extent_free(root, bytenr, num_bytes); return 0; } @@ -6172,20 +6437,67 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, return 0; } +static void +btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + atomic_inc(&bg->reservations); +} + +void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, + const u64 start) +{ + struct btrfs_block_group_cache *bg; + + bg = btrfs_lookup_block_group(fs_info, start); + ASSERT(bg); + if (atomic_dec_and_test(&bg->reservations)) + wake_up_atomic_t(&bg->reservations); + btrfs_put_block_group(bg); +} + +static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +{ + struct btrfs_space_info *space_info = bg->space_info; + + ASSERT(bg->ro); + + if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA)) + return; + + /* + * Our block group is read only but before we set it to read only, + * some task might have had allocated an extent from it already, but it + * has not yet created a respective ordered extent (and added it to a + * root's list of ordered extents). + * Therefore wait for any task currently allocating extents, since the + * block group's reservations counter is incremented while a read lock + * on the groups' semaphore is held and decremented after releasing + * the read access on that semaphore and creating the ordered extent. + */ + down_write(&space_info->groups_sem); + up_write(&space_info->groups_sem); + + wait_on_atomic_t(&bg->reservations, + btrfs_wait_bg_reservations_atomic_t, + TASK_UNINTERRUPTIBLE); +} + /** - * btrfs_update_reserved_bytes - update the block_group and space info counters + * btrfs_add_reserved_bytes - update the block_group and space info counters * @cache: The cache we are manipulating + * @ram_bytes: The number of bytes of file content, and will be same to + * @num_bytes except for the compress path. * @num_bytes: The number of bytes in question - * @reserve: One of the reservation enums * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space, or by somebody who is - * freeing space that was never actually used on disk. For example if you - * reserve some space for a new leaf in transaction A and before transaction A - * commits you free that leaf, you call this with reserve set to 0 in order to - * clear the reservation. - * - * Metadata reservations should be called with RESERVE_ALLOC so we do the proper + * This is called by the allocator when it reserves space. Metadata + * reservations should be called with RESERVE_ALLOC so we do the proper * ENOSPC accounting. For data we handle the reservation through clearing the * delalloc bits in the io_tree. We have to do this since we could end up * allocating less disk space for the amount of data we have reserved in the @@ -6195,44 +6507,63 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, * make the reservation and return -EAGAIN, otherwise this function always * succeeds. */ -static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve, int delalloc) +static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 ram_bytes, u64 num_bytes, int delalloc) { struct btrfs_space_info *space_info = cache->space_info; int ret = 0; spin_lock(&space_info->lock); spin_lock(&cache->lock); - if (reserve != RESERVE_FREE) { - if (cache->ro) { - ret = -EAGAIN; - } else { - cache->reserved += num_bytes; - space_info->bytes_reserved += num_bytes; - if (reserve == RESERVE_ALLOC) { - trace_btrfs_space_reservation(cache->fs_info, - "space_info", space_info->flags, - num_bytes, 0); - space_info->bytes_may_use -= num_bytes; - } - - if (delalloc) - cache->delalloc_bytes += num_bytes; - } + if (cache->ro) { + ret = -EAGAIN; } else { - if (cache->ro) - space_info->bytes_readonly += num_bytes; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + trace_btrfs_space_reservation(cache->fs_info, + "space_info", space_info->flags, + ram_bytes, 0); + space_info->bytes_may_use -= ram_bytes; if (delalloc) - cache->delalloc_bytes -= num_bytes; + cache->delalloc_bytes += num_bytes; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; } +/** + * btrfs_free_reserved_bytes - update the block_group and space info counters + * @cache: The cache we are manipulating + * @num_bytes: The number of bytes in question + * @delalloc: The blocks are allocated for the delalloc write + * + * This is called by somebody who is freeing space that was never actually used + * on disk. For example if you reserve some space for a new leaf in transaction + * A and before transaction A commits you free that leaf, you call this with + * reserve set to 0 in order to clear the reservation. + */ + +static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int delalloc) +{ + struct btrfs_space_info *space_info = cache->space_info; + int ret = 0; + + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + return ret; +} void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -6274,7 +6605,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root, SSD); + bool ssd = btrfs_test_opt(root->fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6352,6 +6683,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + + trace_btrfs_space_reservation(fs_info, "pinned", + space_info->flags, len, 0); space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { @@ -6359,17 +6693,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, readonly = true; } spin_unlock(&cache->lock); - if (!readonly && global_rsv->space_info == space_info) { + if (!readonly && return_free_space && + global_rsv->space_info == space_info) { + u64 to_add = len; + WARN_ON(!return_free_space); spin_lock(&global_rsv->lock); if (!global_rsv->full) { - len = min(len, global_rsv->size - - global_rsv->reserved); - global_rsv->reserved += len; - space_info->bytes_may_use += len; + to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; + space_info->bytes_may_use += to_add; if (global_rsv->reserved >= global_rsv->size) global_rsv->full = 1; + trace_btrfs_space_reservation(fs_info, + "space_info", + space_info->flags, + to_add, 1); + len -= to_add; } spin_unlock(&global_rsv->lock); + /* Add to any tickets we may have */ + if (len) + space_info_add_new_bytes(fs_info, space_info, + len); } spin_unlock(&space_info->lock); } @@ -6404,11 +6750,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); + clear_extent_dirty(unpin, start, end); unpin_extent_range(root, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -6542,7 +6888,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, NULL, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6591,7 +6937,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } extent_slot = path->slots[0]; @@ -6602,10 +6948,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, owner_offset); - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6617,7 +6963,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = convert_extent_item_v0(trans, extent_root, path, owner_objectid, 0); if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6636,7 +6982,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); } if (ret < 0) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -6661,7 +7007,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "trying to drop %d refs but we only have %Lu " "for bytenr %Lu", refs_to_drop, refs, bytenr); ret = -EINVAL; - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } refs -= refs_to_drop; @@ -6684,7 +7030,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, iref, refs_to_drop, is_data, &last_ref); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6707,7 +7053,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -6715,7 +7061,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6723,13 +7069,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, ret = add_to_free_space_tree(trans, root->fs_info, bytenr, num_bytes); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -6852,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; @@ -6878,7 +7224,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(fs_info)) return 0; add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); @@ -7025,36 +7371,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, int delalloc) { struct btrfs_block_group_cache *used_bg = NULL; - bool locked = false; -again: + spin_lock(&cluster->refill_lock); - if (locked) { - if (used_bg == cluster->block_group) + while (1) { + used_bg = cluster->block_group; + if (!used_bg) + return NULL; + + if (used_bg == block_group) return used_bg; - up_read(&used_bg->data_rwsem); - btrfs_put_block_group(used_bg); - } + btrfs_get_block_group(used_bg); - used_bg = cluster->block_group; - if (!used_bg) - return NULL; + if (!delalloc) + return used_bg; - if (used_bg == block_group) - return used_bg; + if (down_read_trylock(&used_bg->data_rwsem)) + return used_bg; - btrfs_get_block_group(used_bg); + spin_unlock(&cluster->refill_lock); - if (!delalloc) - return used_bg; + down_read(&used_bg->data_rwsem); - if (down_read_trylock(&used_bg->data_rwsem)) - return used_bg; + spin_lock(&cluster->refill_lock); + if (used_bg == cluster->block_group) + return used_bg; - spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); - locked = true; - goto again; + up_read(&used_bg->data_rwsem); + btrfs_put_block_group(used_bg); + } } static inline void @@ -7078,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache, * the free space extent currently. */ static noinline int find_free_extent(struct btrfs_root *orig_root, - u64 num_bytes, u64 empty_size, - u64 hint_byte, struct btrfs_key *ins, - u64 flags, int delalloc) + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte, struct btrfs_key *ins, + u64 flags, int delalloc) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -7092,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); - int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? - RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -7425,12 +7768,13 @@ checks: search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, - alloc_type, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ram_bytes, + num_bytes, delalloc); if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } + btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ ins->objectid = search_start; @@ -7471,7 +7815,7 @@ loop: if (loop == LOOP_CACHING_NOWAIT) { /* * We want to skip the LOOP_CACHING_WAIT step if we - * don't have any unached bgs and we've alrelady done a + * don't have any uncached bgs and we've already done a * full search through. */ if (orig_have_caching_bg || !full_search) @@ -7513,8 +7857,7 @@ loop: * can do more things. */ if (ret < 0 && ret != -ENOSPC) - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); else ret = 0; if (!exist) @@ -7568,8 +7911,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly, - (info->full) ? "" : "not "); + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use, (info->full) ? "" : "not "); printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, @@ -7598,7 +7941,7 @@ again: up_read(&info->groups_sem); } -int btrfs_reserve_extent(struct btrfs_root *root, +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) @@ -7610,18 +7953,21 @@ int btrfs_reserve_extent(struct btrfs_root *root, flags = btrfs_get_alloc_profile(root, is_data); again: WARN_ON(num_bytes < root->sectorsize); - ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, - flags, delalloc); - - if (ret == -ENOSPC) { + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(root->fs_info, + ins->objectid); + } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); num_bytes = round_down(num_bytes, root->sectorsize); num_bytes = max(num_bytes, min_alloc_size); + ram_bytes = num_bytes; if (num_bytes == min_alloc_size) final_tried = true; goto again; - } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, flags); @@ -7652,16 +7998,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, if (pin) pin_down_extent(root, cache, start, len, 1); else { - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(root->fs_info, DISCARD)) ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); - btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(root, start, len); } btrfs_put_block_group(cache); - - trace_btrfs_reserved_extent_free(root, start, len); - return ret; } @@ -7870,10 +8214,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; /* * Mixed block groups will exclude before processing the log so we only - * need to do the exlude dance if this fs isn't mixed. + * need to do the exclude dance if this fs isn't mixed. */ if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { ret = __exclude_logged_extent(root, ins->objectid, ins->offset); @@ -7885,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, if (!block_group) return -EINVAL; - ret = btrfs_update_reserved_bytes(block_group, ins->offset, - RESERVE_ALLOC_NO_ACCOUNT, 0); - BUG_ON(ret); /* logic error */ + space_info = block_group->space_info; + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + space_info->bytes_reserved += ins->offset; + block_group->reserved += ins->offset; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); @@ -7901,8 +8251,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf; buf = btrfs_find_create_tree_block(root, bytenr); - if (!buf) - return ERR_PTR(-ENOMEM); + if (IS_ERR(buf)) + return buf; + btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); @@ -7923,13 +8274,13 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + buf->start + buf->len - 1); } else { buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } - trans->blocks_used++; + trans->dirty = true; /* this returns a buffer locked for blocking */ return buf; } @@ -7961,7 +8312,7 @@ again: goto again; } - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8015,19 +8366,21 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (btrfs_test_is_dummy_root(root)) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (btrfs_is_testing(root->fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } +#endif block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); - ret = btrfs_reserve_extent(root, blocksize, blocksize, + ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize, empty_size, hint, &ins, 0, 0); if (ret) goto out_unuse; @@ -8180,34 +8533,6 @@ reada: wc->reada_slot = slot; } -/* - * These may not be seen by the usual inc/dec ref code so we have to - * add them here. - */ -static int record_one_subtree_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes) -{ - struct btrfs_qgroup_extent_record *qrecord; - struct btrfs_delayed_ref_root *delayed_refs; - - qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS); - if (!qrecord) - return -ENOMEM; - - qrecord->bytenr = bytenr; - qrecord->num_bytes = num_bytes; - qrecord->old_roots = NULL; - - delayed_refs = &trans->transaction->delayed_refs; - spin_lock(&delayed_refs->lock); - if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord)) - kfree(qrecord); - spin_unlock(&delayed_refs->lock); - - return 0; -} - static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) @@ -8241,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = record_one_subtree_extent(trans, root, bytenr, num_bytes); + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + bytenr, num_bytes, GFP_NOFS); if (ret) return ret; } @@ -8390,8 +8716,9 @@ walk_down: btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - ret = record_one_subtree_extent(trans, root, child_bytenr, - root->nodesize); + ret = btrfs_qgroup_insert_dirty_extent(trans, + root->fs_info, child_bytenr, + root->nodesize, GFP_NOFS); if (ret) goto out; } @@ -8544,8 +8871,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); + btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, level - 1); reada = 1; @@ -8985,7 +9313,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, &root->root_key, root_item); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -9012,7 +9340,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, tree_root, &root->root_key); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -9020,7 +9348,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_find_root(tree_root, &root->root_key, path, NULL, NULL); if (ret < 0) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } else if (ret > 0) { @@ -9058,7 +9386,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err, NULL); + btrfs_handle_fs_error(root->fs_info, err, NULL); return err; } @@ -9317,7 +9645,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) u64 free_bytes = 0; int factor; - /* It's df, we don't care if it's racey */ + /* It's df, we don't care if it's racy */ if (list_empty(&sinfo->ro_bgs)) return 0; @@ -9391,7 +9719,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) int full = 0; int ret = 0; - debug = btrfs_test_opt(root, ENOSPC_DEBUG); + debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); block_group = btrfs_lookup_block_group(root->fs_info, bytenr); @@ -9547,7 +9875,23 @@ static int find_first_block_group(struct btrfs_root *root, if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = 0; + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &root->fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, found_key.objectid, + found_key.offset); + read_unlock(&em_tree->lock); + if (!em) { + btrfs_err(root->fs_info, + "logical %llu len %llu found bg but no related chunk", + found_key.objectid, found_key.offset); + ret = -ENOENT; + } else { + ret = 0; + } + free_extent_map(em); goto out; } path->slots[0]++; @@ -9584,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) block_group->iref = 0; block_group->inode = NULL; spin_unlock(&block_group->lock); + ASSERT(block_group->io_ctl.inode == NULL); iput(inode); last = block_group->key.objectid + block_group->key.offset; btrfs_put_block_group(block_group); @@ -9641,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) free_excluded_extents(info->extent_root, block_group); btrfs_remove_free_space_cache(block_group); + ASSERT(list_empty(&block_group->dirty_list)); + ASSERT(list_empty(&block_group->io_list)); + ASSERT(list_empty(&block_group->bg_list)); + ASSERT(atomic_read(&block_group->count) == 1); btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); @@ -9663,13 +10012,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { - if (WARN_ON(space_info->bytes_pinned > 0 || + + /* + * Do not hide this behind enospc_debug, this is actually + * important and indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_reserved > 0 || - space_info->bytes_may_use > 0)) { - dump_space_info(space_info, 0, 0); - } - } + space_info->bytes_may_use > 0)) + dump_space_info(space_info, 0, 0); list_del(&space_info->list); for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { struct kobject *kobj; @@ -9787,10 +10138,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root, SPACE_CACHE) && + if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && btrfs_super_generation(root->fs_info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) need_clear = 1; while (1) { @@ -9821,7 +10172,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -9877,9 +10228,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) goto error; } + trace_btrfs_add_block_group(root->fs_info, cache, 0); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), - &space_info); + cache->bytes_super, &space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&info->block_group_cache_lock); @@ -9892,9 +10244,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) } cache->space_info = space_info; - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); __link_block_group(space_info, cache); @@ -9965,11 +10314,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, ret = btrfs_insert_item(trans, extent_root, &key, &item, sizeof(item)); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); ret = btrfs_finish_chunk_alloc(trans, extent_root, key.objectid, key.offset); if (ret) - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, root->fs_info, block_group); /* already aborted the transaction if it failed. */ next: @@ -9986,7 +10335,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - extent_root = root->fs_info->extent_root; btrfs_set_log_full_commit(root->fs_info, trans); @@ -10032,7 +10380,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * assigned to our block group, but don't update its counters just yet. * We want our bg to be added to the rbtree with its ->space_info set. */ - ret = update_space_info(root->fs_info, cache->flags, 0, 0, + ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); @@ -10051,8 +10399,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. */ + trace_btrfs_add_block_group(root->fs_info, cache, 1); ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, - &cache->space_info); + cache->bytes_super, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); spin_lock(&root->fs_info->block_group_cache_lock); @@ -10065,16 +10414,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, } update_global_block_rsv(root->fs_info); - spin_lock(&cache->space_info->lock); - cache->space_info->bytes_readonly += cache->bytes_super; - spin_unlock(&cache->space_info->lock); - __link_block_group(cache->space_info, cache); list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); - return 0; } @@ -10287,7 +10631,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10526,14 +10870,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_dec_block_group_ro(root, block_group); @@ -10555,7 +10899,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root, DISCARD); + trimming = btrfs_test_opt(root->fs_info, DISCARD); /* Implicit trim during transaction commit. */ if (trimming) @@ -10619,21 +10963,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) mixed = 1; flags = BTRFS_BLOCK_GROUP_SYSTEM; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; if (mixed) { flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } else { flags = BTRFS_BLOCK_GROUP_METADATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); if (ret) goto out; flags = BTRFS_BLOCK_GROUP_DATA; - ret = update_space_info(fs_info, flags, 0, 0, &space_info); + ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info); } out: return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d247fc0eea19..44fe66b53c8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -163,13 +163,13 @@ int __init extent_io_init(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer", sizeof(struct extent_buffer), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; @@ -726,14 +726,6 @@ next: start = last_end + 1; if (start <= end && state && !need_resched()) goto hit_next; - goto search_again; - -out: - spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); - - return 0; search_again: if (start > end) @@ -742,6 +734,14 @@ search_again: if (gfpflags_allow_blocking(mask)) cond_resched(); goto again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return 0; + } static void wait_on_state(struct extent_io_tree *tree, @@ -873,8 +873,14 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && gfpflags_allow_blocking(mask)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - BUG_ON(!prealloc); } spin_lock(&tree->lock); @@ -1037,7 +1043,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (gfpflags_allow_blocking(mask)) + cond_resched(); + goto again; out: spin_unlock(&tree->lock); @@ -1046,13 +1058,6 @@ out: return err; -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - goto again; } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -1073,17 +1078,18 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * @bits: the bits to set in this range * @clear_bits: the bits to clear in this range * @cached_state: state that we're going to cache - * @mask: the allocation mask * * This will go through and set bits for the given range. If any states exist * already in this range they are set with the given bit and cleared of the * clear_bits. This is only meant to be used by things that are mergeable, ie * converting from say DELALLOC to DIRTY. This is not meant to be used with * boundary bits like LOCK. + * + * All allocations are done with GFP_NOFS. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask) + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1098,7 +1104,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, btrfs_debug_check_extent_io_range(tree, start, end); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Best effort, don't worry if extent state allocation fails * here for the first iteration. We might have a cached state @@ -1106,7 +1112,7 @@ again: * extent state allocations are needed. We'll only know this * after locking the tree. */ - prealloc = alloc_extent_state(mask); + prealloc = alloc_extent_state(GFP_NOFS); if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1263,7 +1269,13 @@ hit_next: goto out; } - goto search_again; +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + cond_resched(); + first_iteration = false; + goto again; out: spin_unlock(&tree->lock); @@ -1271,21 +1283,11 @@ out: free_extent_state(prealloc); return err; - -search_again: - if (start > end) - goto out; - spin_unlock(&tree->lock); - if (gfpflags_allow_blocking(mask)) - cond_resched(); - first_iteration = false; - goto again; } /* wrappers around set/clear extent bit */ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCKED yet, as current changeset will @@ -1295,7 +1297,7 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS, changeset); } @@ -1308,8 +1310,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset) + unsigned bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCKED case, same reason as @@ -1317,7 +1318,7 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ BUG_ON(bits & EXTENT_LOCKED); - return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS, changeset); } @@ -1975,13 +1976,13 @@ int free_io_failure(struct inode *inode, struct io_failure_record *rec) set_state_failrec(failure_tree, rec->start, NULL); ret = clear_extent_bits(failure_tree, rec->start, rec->start + rec->len - 1, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret) err = ret; ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, rec->start + rec->len - 1, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret && !err) err = ret; @@ -2024,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2036,14 +2044,17 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } bio->bi_bdev = dev->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { + if (btrfsic_submit_bio_wait(bio)) { /* try to remap that extent elsewhere? */ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2053,6 +2064,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } @@ -2232,13 +2244,12 @@ int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, /* set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY); if (ret >= 0) ret = set_state_failrec(failure_tree, start, failrec); /* set the bits in the inode's tree */ if (ret >= 0) - ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED, - GFP_NOFS); + ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED); if (ret < 0) { kfree(failrec); return ret; @@ -2376,7 +2387,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -2402,12 +2413,12 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = tree->ops->submit_bio_hook(inode, read_mode, bio, - failrec->this_mirror, + ret = tree->ops->submit_bio_hook(inode, bio, failrec->this_mirror, failrec->bio_flags, 0); if (ret) { free_io_failure(inode, failrec); @@ -2686,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } @@ -2713,8 +2718,8 @@ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) } -static int __must_check submit_one_bio(int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) +static int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; @@ -2725,33 +2730,31 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, start = page_offset(page) + bvec->bv_offset; bio->bi_private = NULL; - bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) - ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + ret = tree->ops->submit_bio_hook(page->mapping->host, bio, mirror_num, bio_flags, start); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return ret; } -static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, +static int merge_bio(struct extent_io_tree *tree, struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { int ret = 0; if (tree->ops && tree->ops->merge_bio_hook) - ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, + ret = tree->ops->merge_bio_hook(page, offset, size, bio, bio_flags); - BUG_ON(ret < 0); return ret; } -static int submit_extent_page(int rw, struct extent_io_tree *tree, +static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2779,10 +2782,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (prev_bio_flags != bio_flags || !contig || force_bio_submit || - merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || + merge_bio(tree, page, offset, page_size, bio, bio_flags) || bio_add_page(bio, page, page_size, offset) < page_size) { - ret = submit_one_bio(rw, bio, mirror_num, - prev_bio_flags); + ret = submit_one_bio(bio, mirror_num, prev_bio_flags); if (ret < 0) { *bio_ret = NULL; return ret; @@ -2803,6 +2805,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; + bio_set_op_attrs(bio, op, op_flags); if (wbc) { wbc_init_bio(wbc, bio); wbc_account_io(wbc, page, page_size); @@ -2811,7 +2814,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (bio_ret) *bio_ret = bio; else - ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } @@ -2869,13 +2872,14 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * into the tree that are removed when the IO is done (by the end_io * handlers) * XXX JDM: This needs looking at to ensure proper page locking + * return 0 on success, otherwise return error */ static int __do_readpage(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -2890,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; - int ret; + int ret = 0; int nr = 0; size_t pg_offset = 0; size_t iosize; @@ -3058,8 +3062,8 @@ static int __do_readpage(struct extent_io_tree *tree, } pnr -= page->index; - ret = submit_extent_page(rw, tree, NULL, page, - sector, disk_io_size, pg_offset, + ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, + page, sector, disk_io_size, pg_offset, bdev, bio, pnr, end_bio_extent_readpage, mirror_num, *bio_flags, @@ -3071,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, } else { SetPageError(page); unlock_extent(tree, cur, cur + iosize - 1); + goto out; } cur = cur + iosize; pg_offset += iosize; @@ -3081,7 +3086,7 @@ out: SetPageUptodate(page); unlock_page(page); } - return 0; + return ret; } static inline void __do_contiguous_readpages(struct extent_io_tree *tree, @@ -3090,7 +3095,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { struct inode *inode; @@ -3111,7 +3116,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree, for (index = 0; index < nr_pages; index++) { __do_readpage(tree, pages[index], get_extent, em_cached, bio, - mirror_num, bio_flags, rw, prev_em_start); + mirror_num, bio_flags, 0, prev_em_start); put_page(pages[index]); } } @@ -3121,7 +3126,7 @@ static void __extent_readpages(struct extent_io_tree *tree, int nr_pages, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw, + unsigned long *bio_flags, u64 *prev_em_start) { u64 start = 0; @@ -3143,7 +3148,7 @@ static void __extent_readpages(struct extent_io_tree *tree, index - first_index, start, end, get_extent, em_cached, bio, mirror_num, bio_flags, - rw, prev_em_start); + prev_em_start); start = page_start; end = start + PAGE_SIZE - 1; first_index = index; @@ -3154,7 +3159,7 @@ static void __extent_readpages(struct extent_io_tree *tree, __do_contiguous_readpages(tree, &pages[first_index], index - first_index, start, end, get_extent, em_cached, bio, - mirror_num, bio_flags, rw, + mirror_num, bio_flags, prev_em_start); } @@ -3162,7 +3167,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int rw) + unsigned long *bio_flags, int read_flags) { struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; @@ -3182,7 +3187,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, } ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num, - bio_flags, rw, NULL); + bio_flags, read_flags, NULL); return ret; } @@ -3194,20 +3199,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, int ret; ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, - &bio_flags, READ); + &bio_flags, 0); if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; } -static noinline void update_nr_written(struct page *page, - struct writeback_control *wbc, - unsigned long nr_written) +static void update_nr_written(struct page *page, struct writeback_control *wbc, + unsigned long nr_written) { wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; } /* @@ -3368,6 +3369,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, while (cur <= end) { u64 em_end; + unsigned long max_nr; + if (cur >= i_size) { if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, @@ -3423,32 +3426,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, continue; } - if (tree->ops && tree->ops->writepage_io_hook) { - ret = tree->ops->writepage_io_hook(page, cur, - cur + iosize - 1); - } else { - ret = 0; + max_nr = (i_size >> PAGE_SHIFT) + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + btrfs_err(BTRFS_I(inode)->root->fs_info, + "page %lu not writeback, cur %llu end %llu", + page->index, cur, end); } - if (ret) { - SetPageError(page); - } else { - unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1; - set_range_writeback(tree, cur, cur + iosize - 1); - if (!PageWriteback(page)) { - btrfs_err(BTRFS_I(inode)->root->fs_info, - "page %lu not writeback, cur %llu end %llu", - page->index, cur, end); - } + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + page, sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0, false); + if (ret) + SetPageError(page); - ret = submit_extent_page(write_flags, tree, wbc, page, - sector, iosize, pg_offset, - bdev, &epd->bio, max_nr, - end_bio_extent_writepage, - 0, 0, 0, false); - if (ret) - SetPageError(page); - } cur = cur + iosize; pg_offset += iosize; nr++; @@ -3481,13 +3475,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - int write_flags; + int write_flags = 0; unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) write_flags = WRITE_SYNC; - else - write_flags = WRITE; trace___extent_writepage(page, inode, wbc); @@ -3731,7 +3723,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, u64 offset = eb->start; unsigned long i, num_pages; unsigned long bio_flags = 0; - int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; + int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3745,9 +3737,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(rw, tree, wbc, p, offset >> 9, - PAGE_SIZE, 0, bdev, &epd->bio, - -1, end_bio_extent_buffer_writepage, + ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + p, offset >> 9, PAGE_SIZE, 0, bdev, + &epd->bio, -1, + end_bio_extent_buffer_writepage, 0, epd->bio_flags, bio_flags, false); epd->bio_flags = bio_flags; if (ret) { @@ -3920,12 +3913,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, struct inode *inode = mapping->host; int ret = 0; int done = 0; - int err = 0; int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; pgoff_t end; /* Inclusive */ + pgoff_t done_index; + int range_whole = 0; int scanned = 0; int tag; @@ -3948,6 +3942,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -3957,6 +3953,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); + done_index = index; while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -3966,6 +3963,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + done_index = page->index; /* * At this point we hold neither mapping->tree_lock nor * lock on the page itself: the page may be truncated or @@ -4007,8 +4005,20 @@ retry: unlock_page(page); ret = 0; } - if (!err && ret < 0) - err = ret; + if (ret < 0) { + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + done_index = page->index + 1; + done = 1; + break; + } /* * the filesystem may choose to bump up nr_to_write. @@ -4020,7 +4030,7 @@ retry: pagevec_release(&pvec); cond_resched(); } - if (!scanned && !done && !err) { + if (!scanned && !done) { /* * We hit the last page and there is more work to be done: wrap * back to the start of the file @@ -4029,20 +4039,23 @@ retry: index = 0; goto retry; } + + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) + mapping->writeback_index = done_index; + btrfs_add_delayed_iput(inode); - return err; + return ret; } static void flush_epd_write_bio(struct extent_page_data *epd) { if (epd->bio) { - int rw = WRITE; int ret; - if (epd->sync_io) - rw = WRITE_SYNC; + bio_set_op_attrs(epd->bio, REQ_OP_WRITE, + epd->sync_io ? WRITE_SYNC : 0); - ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags); + ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ epd->bio = NULL; } @@ -4160,7 +4173,8 @@ int extent_readpages(struct extent_io_tree *tree, prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, - page->index, GFP_NOFS)) { + page->index, + readahead_gfp_mask(mapping))) { put_page(page); continue; } @@ -4169,19 +4183,19 @@ int extent_readpages(struct extent_io_tree *tree, if (nr < ARRAY_SIZE(pagepool)) continue; __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); nr = 0; } if (nr) __extent_readpages(tree, pagepool, nr, get_extent, &em_cached, - &bio, 0, &bio_flags, READ, &prev_em_start); + &bio, 0, &bio_flags, &prev_em_start); if (em_cached) free_extent_map(em_cached); BUG_ON(!list_empty(pages)); if (bio) - return submit_one_bio(READ, bio, 0, bio_flags); + return submit_one_bio(bio, 0, bio_flags); return 0; } @@ -4379,8 +4393,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret < 0) { btrfs_free_path(path); return ret; + } else { + WARN_ON(!ret); + if (ret == 1) + ret = 0; } - WARN_ON(!ret); + path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = found_key.type; @@ -4591,7 +4609,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) if (mapped) spin_unlock(&page->mapping->private_lock); - /* One for when we alloced the page */ + /* One for when we allocated the page */ put_page(page); } while (index != 0); } @@ -4704,16 +4722,16 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { unsigned long len; if (!fs_info) { /* * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 + * available */ - len = 4096; + len = nodesize; } else { len = fs_info->tree_root->nodesize; } @@ -4809,7 +4827,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) + u64 start, u32 nodesize) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4817,12 +4835,12 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start); + eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); if (!eb) return NULL; eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + ret = radix_tree_preload(GFP_NOFS); if (ret) goto free_eb; spin_lock(&fs_info->buffer_lock); @@ -4868,18 +4886,25 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; + if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return ERR_PTR(-EINVAL); + } + eb = find_extent_buffer(fs_info, start); if (eb) return eb; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) + if (!p) { + exists = ERR_PTR(-ENOMEM); goto free_eb; + } spin_lock(&mapping->private_lock); if (PagePrivate(p)) { @@ -4923,9 +4948,11 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); again: - ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (ret) + ret = radix_tree_preload(GFP_NOFS); + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -5198,22 +5225,38 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = eb->pages[i]; + if (!PageUptodate(page)) { + if (ret) { + atomic_dec(&eb->io_pages); + unlock_page(page); + continue; + } + ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num, &bio_flags, - READ | REQ_META); - if (err) + REQ_META); + if (err) { ret = err; + /* + * We use &bio in above __extent_read_full_page, + * so we ensure that if it returns error, the + * current page fails to add itself to bio and + * it's been unlocked. + * + * We must dec io_pages by ourselves. + */ + atomic_dec(&eb->io_pages); + } } else { unlock_page(page); } } if (bio) { - err = submit_one_bio(READ | REQ_META, bio, mirror_num, - bio_flags); + err = submit_one_bio(bio, mirror_num, bio_flags); if (err) return err; } @@ -5309,6 +5352,11 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, return ret; } +/* + * return 0 if the item is found within a page. + * return 1 if the item spans two pages. + * return -EINVAL otherwise. + */ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long min_len, char **map, unsigned long *map_start, @@ -5323,7 +5371,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, PAGE_SHIFT; if (i != end_i) - return -EINVAL; + return 1; if (i == 0) { offset = start_offset; @@ -5751,7 +5799,7 @@ int try_release_extent_buffer(struct page *page) struct extent_buffer *eb; /* - * We need to make sure noboody is attaching this page to an eb right + * We need to make sure nobody is attaching this page to an eb right * now. */ spin_lock(&page->mapping->private_lock); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b5e0ade90e88..28cd88fccc7e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -20,6 +20,7 @@ #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) +#define EXTENT_CLEAR_DATA_RESV (1U << 17) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -63,17 +64,16 @@ struct btrfs_root; struct btrfs_io_bio; struct io_failure_record; -typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, u64 bio_offset); +typedef int (extent_submit_bio_hook_t)(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, + u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); - int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; - int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, + int (*merge_bio_hook)(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); @@ -221,8 +221,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); @@ -241,27 +240,27 @@ static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { int wake = 0; if (bits & EXTENT_LOCKED) wake = 1; - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, + GFP_NOFS); } int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask, - struct extent_changeset *changeset); + unsigned bits, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, unsigned bits, gfp_t mask) + u64 end, unsigned bits) { - return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); + return set_extent_bit(tree, start, end, bits, NULL, NULL, GFP_NOFS); } static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -279,37 +278,38 @@ static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, } static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); } int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, - struct extent_state **cached_state, gfp_t mask); + struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) + u64 end, struct extent_state **cached_state) { return set_extent_bit(tree, start, end, EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); + NULL, cached_state, GFP_NOFS); } static inline int set_extent_new(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) + u64 end) { - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, + GFP_NOFS); } static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, @@ -349,7 +349,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -469,5 +469,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start, u32 nodesize); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 318b048eb254..26f9ac719d20 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ int __init extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void) /** * free_extent_map - drop reference count of an extent_map - * @em: extent map being releasead + * @em: extent map being released * * Drops the reference out on @em by one and free the structure * if the reference count hits zero. diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7a7d6e253cfc..d0d571c47d33 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -248,9 +248,9 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, offset + root->sectorsize - 1, - EXTENT_NODATASUM, GFP_NOFS); + EXTENT_NODATASUM); } else { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(BTRFS_I(inode)->root->fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ea9f10bb089c..fea31a4a6e36 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode, static inline int __need_auto_defrag(struct btrfs_root *root) { - if (!btrfs_test_opt(root, AUTO_DEFRAG)) + if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) return 0; if (btrfs_fs_closing(root->fs_info)) @@ -950,7 +950,7 @@ delete_extent_item: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -974,7 +974,7 @@ delete_extent_item: path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; @@ -1190,7 +1190,7 @@ again: goto again; } if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -1278,7 +1278,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } @@ -1534,30 +1534,30 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, reserve_bytes = round_up(write_bytes + sector_offset, root->sectorsize); - if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC)) && - check_can_nocow(inode, pos, &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ - only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); - goto reserve_metadata; - } - ret = btrfs_check_data_free_space(inode, pos, write_bytes); - if (ret < 0) - break; + if (ret < 0) { + if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) && + check_can_nocow(inode, pos, &write_bytes) > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ + only_release_metadata = true; + /* + * our prealloc extent may be smaller than + * write_bytes, so scale down. + */ + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_SIZE); + reserve_bytes = round_up(write_bytes + + sector_offset, + root->sectorsize); + } else { + break; + } + } -reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) @@ -1596,6 +1596,13 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); + num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + reserve_bytes); + dirty_sectors = round_up(copied + sector_offset, + root->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, + dirty_sectors); + /* * if we have trouble faulting in the pages, fall * back to one page at a time @@ -1605,6 +1612,7 @@ again: if (copied == 0) { force_page_uptodate = true; + dirty_sectors = 0; dirty_pages = 0; } else { force_page_uptodate = false; @@ -1615,20 +1623,17 @@ again: /* * If we had a short copy we need to release the excess delaloc * bytes we reserved. We need to increment outstanding_extents - * because btrfs_delalloc_release_space will decrement it, but + * because btrfs_delalloc_release_space and + * btrfs_delalloc_release_metadata will decrement it, but * we still have an outstanding extent for the chunk we actually * managed to copy. */ - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); - if (num_sectors > dirty_sectors) { - release_bytes = (write_bytes - copied) - & ~((u64)root->sectorsize - 1); + + /* release everything except the sectors we dirtied */ + release_bytes -= dirty_sectors << + root->fs_info->sb->s_blocksize_bits; + if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -1696,7 +1701,9 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, pos, release_bytes); + btrfs_delalloc_release_space(inode, + round_down(pos, root->sectorsize), + release_bytes); } } @@ -2020,12 +2027,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed)) { /* - * We'v had everything committed since the last time we were + * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * An ordered extent might have started before and completed + * already with io errors, in which case the inode was not + * updated and we end up here. So check the inode's mapping + * flags for any errors that might have happened while doing + * writeback of file data. + */ + ret = btrfs_inode_check_errors(inode); inode_unlock(inode); goto out; } @@ -2055,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx); + btrfs_init_log_ctx(&ctx, inode); ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { @@ -2368,7 +2383,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page - * including serveral following pages are already in holes, + * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ @@ -2470,7 +2485,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); trans->block_rsv = rsv; @@ -2513,7 +2528,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; @@ -2660,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode, alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); + cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2752,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode, /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); - cur_offset = alloc_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); @@ -2779,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte - cur_offset); if (ret < 0) break; + } else { + /* + * Do not need to reserve unwritten extent for this + * range, free reserved data space first, otherwise + * it'll result in false ENOSPC error. + */ + btrfs_free_reserved_data_space(inode, cur_offset, + last_byte - cur_offset); } free_extent_map(em); cur_offset = last_byte; @@ -2796,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode, range->start, range->len, 1 << inode->i_blkbits, offset + len, &alloc_hint); + else + btrfs_free_reserved_data_space(inode, range->start, + range->len); list_del(&range->list); kfree(range); } @@ -2830,18 +2856,11 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_KERNEL); out: - /* - * As we waited the extent range, the data_rsv_map must be empty - * in the range, as written data range will be released from it. - * And for prealloacted extent, it will also be released when - * its metadata is written. - * So this is completely used as cleanup. - */ - btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); inode_unlock(inode); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_start, - alloc_end - alloc_start); + if (ret != 0) + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - cur_offset); return ret; } @@ -2952,7 +2971,7 @@ const struct file_operations btrfs_file_operations = { .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, @@ -2968,7 +2987,7 @@ int btrfs_auto_defrag_init(void) { btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", sizeof(struct inode_defrag), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_inode_defrag_cachep) return -ENOMEM; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5e6062c26129..d571bd2b697b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,7 +29,7 @@ #include "inode-map.h" #include "volumes.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { @@ -280,7 +280,7 @@ fail: if (locked) mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1415,11 +1415,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u32 bytes_per_bitmap; + u64 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1638,10 +1638,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); + u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u64 max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max_t(u32, max_bitmaps, 1); + max_bitmaps = max_t(u64, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as * we add more bitmaps. */ - bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE; + bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; if (bitmap_bytes >= max_bytes) { ctl->extents_thresh = 0; @@ -1983,7 +1983,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want - * to reserve them to larger extents, however if we have plent + * to reserve them to larger extents, however if we have plenty * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ @@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ - if (btrfs_test_opt(root, SSD_SPREAD)) { + if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; @@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_io_ctl io_ctl; bool release_metadata = true; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); @@ -3662,7 +3662,7 @@ have_info: if (tmp->offset + tmp->bytes < offset) break; if (offset + bytes < tmp->offset) { - n = rb_prev(&info->offset_index); + n = rb_prev(&tmp->offset_index); continue; } info = tmp; @@ -3676,7 +3676,7 @@ have_info: if (offset + bytes < tmp->offset) break; if (tmp->offset + tmp->bytes < offset) { - n = rb_next(&info->offset_index); + n = rb_next(&tmp->offset_index); continue; } info = tmp; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 33178c490ace..3af651c2bbc7 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -123,7 +123,7 @@ int btrfs_return_cluster_to_free_space( int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); -/* Support functions for runnint our sanity tests */ +/* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int test_add_free_space_entry(struct btrfs_block_group_cache *cache, u64 offset, u64 bytes, bool bitmap); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 53dbeaf6ce94..87e7e3d3e676 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, out: kvfree(bitmap); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: fs_info->creating_free_space_tree = 0; - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return 0; abort: - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, tree_root); return ret; } @@ -1333,7 +1333,7 @@ out: btrfs_free_path(path); mutex_unlock(&block_group->free_space_lock); if (ret) - btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..a97fdc156a03 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -24,6 +24,11 @@ int __init btrfs_hash_init(void) return PTR_ERR_OR_ZERO(tfm); } +const char* btrfs_crc32c_impl(void) +{ + return crypto_tfm_alg_driver_name(crypto_shash_tfm(tfm)); +} + void btrfs_hash_exit(void) { crypto_free_shash(tfm); diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 118a2316e5d3..c3a2ec554361 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -22,6 +22,7 @@ int __init btrfs_hash_init(void); void btrfs_hash_exit(void); +const char* btrfs_crc32c_impl(void); u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index be4d22a5022f..b8acc07ac6c2 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT, NULL); + btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 70107f7c9307..359ee861b5a4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; spin_lock(&root->ino_cache_lock); @@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return btrfs_find_free_objectid(root, objectid); again: @@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { @@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return; while (1) { @@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -458,7 +458,7 @@ again: BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } @@ -466,7 +466,7 @@ again: ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_put; } } @@ -495,10 +495,9 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, 0, prealloc); + btrfs_delalloc_release_metadata(inode, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index da5799c56160..e6811c42e41e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "hash.h" #include "props.h" #include "qgroup.h" +#include "dedupe.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash); static struct extent_map *create_pinned_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, @@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, start, aligned_end, NULL, 1, 1, extent_item_size, &extent_inserted); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, inline_len, compressed_size, compress_type, compressed_pages); if (ret && ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; @@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; /* force compress */ - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root, COMPRESS) || + if (btrfs_test_opt(root->fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -455,7 +457,7 @@ again: /* * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. + * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) @@ -564,6 +566,8 @@ cont: PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); goto free_pages_out; } } @@ -585,9 +589,27 @@ cont: will_compress = 0; } else { num_bytes = total_in; + *num_added += 1; + + /* + * The async work queues will take care of doing actual + * allocation on disk for these compressed pages, and + * will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret, + compress_type); + + if (start + num_bytes < end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + return; } } - if (!will_compress && pages) { + if (pages) { /* * the compression code ran but failed to make things smaller, * free any pages it allocated and our page pointer array @@ -602,48 +624,28 @@ cont: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(root, FORCE_COMPRESS) && + if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } - if (will_compress) { - *num_added += 1; - - /* the async work queues will take care of doing actual - * allocation on disk for these compressed pages, - * and will submit them to the elevator. - */ - add_async_extent(async_cow, start, num_bytes, - total_compressed, pages, nr_pages_ret, - compress_type); - - if (start + num_bytes < end) { - start += num_bytes; - pages = NULL; - cond_resched(); - goto again; - } - } else { cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in - * the file we've been given so far. redirty the locked - * page if it corresponds to our extent and set things up - * for the async work queue to run cow_file_range to do - * the normal delalloc dance - */ - if (page_offset(locked_page) >= start && - page_offset(locked_page) <= end) { - __set_page_dirty_nobuffers(locked_page); - /* unlocked later on in the async handlers */ - } - if (redirty) - extent_range_redirty_for_io(inode, start, end); - add_async_extent(async_cow, start, end - start + 1, - 0, NULL, 0, BTRFS_COMPRESS_NONE); - *num_added += 1; - } + /* + * No compression, but we still need to write the pages in the file + * we've been given so far. redirty the locked page if it corresponds + * to our extent and set things up for the async work queue to run + * cow_file_range to do the normal delalloc dance. + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + + if (redirty) + extent_range_redirty_for_io(inode, start, end); + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); + *num_added += 1; return; @@ -712,7 +714,10 @@ retry: async_extent->start, async_extent->start + async_extent->ram_size - 1, - &page_started, &nr_written, 0); + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0, + NULL); /* JDM XXX */ @@ -739,7 +744,7 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); - ret = btrfs_reserve_extent(root, + ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); @@ -824,6 +829,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. @@ -861,6 +867,7 @@ retry: } return; out_free_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, @@ -923,9 +930,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, - int unlock) + u64 start, u64 end, u64 delalloc_end, + int *page_started, unsigned long *nr_written, + int unlock, struct btrfs_dedupe_hash *hash) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; @@ -964,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); - + btrfs_free_reserved_data_space_noquota(inode, start, + end - start + 1); *nr_written = *nr_written + (end - start + PAGE_SIZE) / PAGE_SIZE; *page_started = 1; @@ -984,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode, unsigned long op; cur_alloc_size = disk_num_bytes; - ret = btrfs_reserve_extent(root, cur_alloc_size, + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) @@ -1038,6 +1046,8 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (disk_num_bytes < cur_alloc_size) break; @@ -1066,6 +1076,7 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, locked_page, @@ -1151,7 +1162,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root, FORCE_COMPRESS)) + !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1377,6 +1388,9 @@ next_slot: */ if (csum_exist_in_range(root, disk_bytenr, num_bytes)) goto out_check; + if (!btrfs_inc_nocow_writers(root->fs_info, + disk_bytenr)) + goto out_check; nocow = 1; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = found_key.offset + @@ -1391,6 +1405,9 @@ out_check: path->slots[0]++; if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto next_slot; } if (!nocow) { @@ -1407,10 +1424,14 @@ out_check: if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, - page_started, nr_written, 1); + end, page_started, nr_written, 1, + NULL); if (ret) { if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, + disk_bytenr); goto error; } cow_start = (u64)-1; @@ -1453,6 +1474,8 @@ out_check: ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, num_bytes, num_bytes, type); + if (nocow) + btrfs_dec_nocow_writers(root->fs_info, disk_bytenr); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == @@ -1469,8 +1492,10 @@ out_check: extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_LOCKED | - EXTENT_DELALLOC, PAGE_UNLOCK | - PAGE_SET_PRIVATE2); + EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_PRIVATE2); + if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; @@ -1485,8 +1510,8 @@ out_check: } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, cow_start, end, end, + page_started, nr_written, 1, NULL); if (ret) goto error; } @@ -1545,8 +1570,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode)) { - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + ret = cow_file_range(inode, locked_page, start, end, end, + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); @@ -1724,7 +1749,7 @@ static void btrfs_set_bit_hook(struct inode *inode, } /* For sanity tests */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; __percpu_counter_add(&root->fs_info->delalloc_bytes, len, @@ -1783,11 +1808,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, btrfs_delalloc_release_metadata(inode, len); /* For sanity tests. */ - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return; if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE)) + && do_list && !(state->state & EXTENT_NORESERVE) + && (*bits & (EXTENT_DO_ACCOUNTING | + EXTENT_CLEAR_DATA_RESV))) btrfs_free_reserved_data_space_noquota(inode, state->start, len); @@ -1806,8 +1833,12 @@ static void btrfs_clear_bit_hook(struct inode *inode, /* * extent_io.c merge_bio_hook, this must check the chunk tree to make sure * we don't create bios that span stripes or chunks + * + * return 1 if page cannot be merged to bio + * return 0 if page can be merged to bio + * return error otherwise */ -int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags) { @@ -1822,10 +1853,10 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, length = bio->bi_iter.bi_size; map_length = length; - ret = btrfs_map_block(root->fs_info, rw, logical, + ret = btrfs_map_block(root->fs_info, bio_op(bio), logical, &map_length, NULL, 0); - /* Will always return 0 with map_multi == NULL */ - BUG_ON(ret < 0); + if (ret < 0) + return ret; if (map_length < length + size) return 1; return 0; @@ -1839,9 +1870,8 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_start(struct inode *inode, int rw, - struct bio *bio, int mirror_num, - unsigned long bio_flags, +static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1860,14 +1890,14 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, +static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 1); + ret = btrfs_map_bio(root, bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -1879,7 +1909,7 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * extent_io.c submission hook. This does the right thing for csum calculation * on write, or reading the csums from the tree before a read */ -static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, +static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { @@ -1894,7 +1924,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (btrfs_is_free_space_inode(inode)) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) != REQ_OP_WRITE) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); if (ret) goto out; @@ -1916,7 +1946,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, bio, mirror_num, bio_flags, bio_offset, __btrfs_submit_bio_start, __btrfs_submit_bio_done); @@ -1928,7 +1958,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } mapit: - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); out: if (ret < 0) { @@ -1962,7 +1992,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state, GFP_NOFS); + cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2579,7 +2609,7 @@ again: ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2606,7 +2636,7 @@ again: backref->root_id, backref->inum, new->file_pos); /* start - extent_offset */ if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_free_path; } @@ -2875,7 +2905,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2935,7 +2965,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->len, trans->transid); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } @@ -2945,7 +2975,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_unlock; } ret = 0; @@ -3103,8 +3133,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, - GFP_NOFS); + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); return 0; } @@ -3190,7 +3219,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); @@ -3256,7 +3285,16 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) /* grab metadata reservation from transaction handle */ if (reserve) { ret = btrfs_orphan_reserve_metadata(trans, inode); - BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */ + ASSERT(!ret); + if (ret) { + atomic_dec(&root->orphan_inodes); + clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, + &BTRFS_I(inode)->runtime_flags); + if (insert) + clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, + &BTRFS_I(inode)->runtime_flags); + return ret; + } } /* insert an orphan item to track this unlinked/truncated file */ @@ -3272,7 +3310,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3284,7 +3322,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, root->root_key.objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } } @@ -3404,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ESTALE) + if (ret && ret != -ENOENT) goto out; - if (ret == -ESTALE && root == root->fs_info->tree_root) { + if (ret == -ENOENT && root == root->fs_info->tree_root) { struct btrfs_root *dead_root; struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; @@ -3443,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * Inode is already gone but the orphan item is still there, * kill the orphan item. */ - if (ret == -ESTALE) { + if (ret == -ENOENT) { trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3602,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static void btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -3621,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode) filled = true; path = btrfs_alloc_path(); - if (!path) + if (!path) { + ret = -ENOMEM; goto make_bad; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); - if (ret) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto make_bad; + } leaf = path->nodes[0]; @@ -3706,7 +3749,7 @@ cache_index: * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, - * but it guarantees correctness at the expense of ocassional full + * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ @@ -3781,11 +3824,12 @@ cache_acl: } btrfs_update_iflags(inode); - return; + return 0; make_bad: btrfs_free_path(path); make_bad_inode(inode); + return ret; } /* @@ -3983,20 +4027,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_info(root->fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name_len, name, ino, dir_ino); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } skip_backref: ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir_ino); if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto err; } @@ -4005,7 +4049,7 @@ skip_backref: if (ret == -ENOENT) ret = 0; else if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err: btrfs_free_path(path); if (ret) @@ -4119,7 +4163,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); @@ -4129,7 +4173,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir_ino, &index, name, name_len); if (ret < 0) { if (ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } di = btrfs_search_dir_index_item(root, path, dir_ino, @@ -4139,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = -ENOENT; else ret = PTR_ERR(di); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4152,7 +4196,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_delete_delayed_dir_index(trans, root, dir, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -4161,7 +4205,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); return ret; @@ -4173,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; + u64 last_unlink_trans; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4195,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (err) goto out; + last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; + /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); - if (!err) + if (!err) { btrfs_i_size_write(inode, 0); + /* + * Propagate the last_unlink_trans value of the deleted dir to + * its parent directory. This is to prevent an unrecoverable + * log tree in the case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + */ + if (last_unlink_trans >= trans->transid) + BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; + } out: btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -4482,7 +4543,6 @@ search_again: pending_del_nr); if (err) { btrfs_abort_transaction(trans, - root, err); goto error; } @@ -4494,8 +4554,7 @@ search_again: item_end, new_size); if (err) { - btrfs_abort_transaction(trans, - root, err); + btrfs_abort_transaction(trans, err); goto error; } } else if (test_bit(BTRFS_ROOT_REF_COWS, @@ -4534,6 +4593,7 @@ delete: BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, + trans->transid, trans->delayed_ref_updates * 2, 0); if (be_nice) { if (truncate_space_check(trans, root, @@ -4558,8 +4618,7 @@ delete: pending_del_slot, pending_del_nr); if (ret) { - btrfs_abort_transaction(trans, - root, ret); + btrfs_abort_transaction(trans, ret); goto error; } pending_del_nr = 0; @@ -4592,7 +4651,7 @@ out: ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); } error: if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) @@ -4761,7 +4820,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); return ret; } @@ -4769,7 +4828,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode, ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset, 0, 0, len, 0, len, 0, 0, 0); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); else btrfs_update_inode(trans, root, inode); btrfs_end_transaction(trans, root); @@ -4962,7 +5021,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * be instantly completed which will give us extents that need * to be truncated. If we fail to get an orphan inode down we * could have left over extents that were never meant to live, - * so we need to garuntee from this point on that everything + * so we need to guarantee from this point on that everything * will be consistent. */ ret = btrfs_orphan_add(trans, inode); @@ -4996,7 +5055,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, BTRFS_I(inode)->disk_i_size); err = btrfs_orphan_del(trans, inode); if (err) - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); btrfs_end_transaction(trans, root); } } @@ -5134,11 +5193,18 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; int steal_from_global = 0; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size; int ret; trace_btrfs_inode_evict(inode); + if (!root) { + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + return; + } + + min_size = btrfs_calc_trunc_metadata_size(root, 1); + evict_inode_truncate_pages(inode); if (inode->i_nlink && @@ -5232,14 +5298,14 @@ void btrfs_evict_inode(struct inode *inode) } /* - * We can't just steal from the global reserve, we need tomake + * We can't just steal from the global reserve, we need to make * sure there is room to do it, if not we need to commit and try * again. */ if (steal_from_global) { if (!btrfs_check_space_for_delayed_refs(trans, root)) ret = btrfs_block_rsv_migrate(global_rsv, rsv, - min_size); + min_size, 0); else ret = -ENOSPC; } @@ -5570,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); + int ret; + + ret = btrfs_read_locked_inode(inode); if (!is_bad_inode(inode)) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5579,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, } else { unlock_new_inode(inode); iput(inode); - inode = ERR_PTR(-ESTALE); + ASSERT(ret < 0); + inode = ERR_PTR(ret < 0 ? ret : -ESTALE); } } @@ -5733,6 +5802,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) int name_len; int is_curr = 0; /* ctx->pos points to the current index? */ bool emitted; + bool put = false; /* FIXME, use a real flag for deciding about the key type */ if (root->fs_info->tree_root == root) @@ -5750,7 +5820,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); - btrfs_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, &ins_list, + &del_list); } key.type = key_type; @@ -5897,8 +5968,8 @@ next: nopos: ret = 0; err: - if (key_type == BTRFS_DIR_INDEX_KEY) - btrfs_put_delayed_items(&ins_list, &del_list); + if (put) + btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } @@ -6213,9 +6284,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(root->fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(root->fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } @@ -6293,7 +6364,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; } @@ -6304,7 +6375,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode(trans, root, parent_inode); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: @@ -6964,7 +7035,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. */ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to @@ -7129,6 +7211,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7139,46 +7258,18 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); - ret = btrfs_reserve_extent(root, len, root->sectorsize, 0, + ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } return em; } @@ -7408,7 +7499,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, cached_state); /* * We're concerned with the entire range that we're going to be - * doing DIO to, so we need to make sure theres no ordered + * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(inode, lockstart, @@ -7570,7 +7661,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (current->journal_info) { /* * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transction doesn't get + * that anything that needs to check if there's a transaction doesn't get * confused. */ dio_data = current->journal_info; @@ -7603,7 +7694,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * - * We return -ENOTBLK because thats what makes DIO go ahead and go back + * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ @@ -7650,26 +7741,30 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes) == 1) { + &orig_block_len, &ram_bytes) == 1 && + btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; + + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); + btrfs_dec_nocow_writers(root->fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto unlock_err; - } + em = em2; } - - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); - if (ret) { - free_extent_map(em); + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); goto unlock_err; } + /* + * For inode marked NODATACOW or extent marked PREALLOC, + * use the existing or preallocated extent, so does not + * need to adjust btrfs_space_info's bytes_may_use. + */ + btrfs_free_reserved_data_space_noquota(inode, + start, len); goto unlock; } } @@ -7704,7 +7799,6 @@ unlock: i_size_write(inode, start + len); adjust_dio_outstanding_extents(inode, dio_data, len); - btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; dio_data->unsubmitted_oe_range_end = start + len; @@ -7746,12 +7840,12 @@ err: } static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int rw, int mirror_num) + int mirror_num) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - BUG_ON(rw & REQ_WRITE); + BUG_ON(bio_op(bio) == REQ_OP_WRITE); bio_get(bio); @@ -7760,7 +7854,7 @@ static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, if (ret) goto err; - ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); + ret = btrfs_map_bio(root, bio, mirror_num, 0); err: bio_put(bio); return ret; @@ -7811,7 +7905,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode; int ret; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); + BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) @@ -7839,13 +7933,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, free_io_failure(inode, failrec); return -EIO; } + bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, read_mode, - failrec->this_mirror); + ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); if (ret) { free_io_failure(inode, failrec); bio_put(bio); @@ -8135,7 +8229,7 @@ static void btrfs_endio_direct_write(struct bio *bio) bio_put(bio); } -static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 offset) { @@ -8153,8 +8247,8 @@ static void btrfs_end_dio_bio(struct bio *bio) if (err) btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, + "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -8228,11 +8322,11 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, } static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - int rw, u64 file_offset, int skip_sum, + u64 file_offset, int skip_sum, int async_submit) { struct btrfs_dio_private *dip = bio->bi_private; - int write = rw & REQ_WRITE; + bool write = bio_op(bio) == REQ_OP_WRITE; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -8253,8 +8347,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (write && async_submit) { ret = btrfs_wq_submit_bio(root->fs_info, - inode, rw, bio, 0, 0, - file_offset, + inode, bio, 0, 0, file_offset, __btrfs_submit_bio_start_direct_io, __btrfs_submit_bio_done); goto err; @@ -8273,13 +8366,13 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto err; } map: - ret = btrfs_map_bio(root, rw, bio, 0, async_submit); + ret = btrfs_map_bio(root, bio, 0, async_submit); err: bio_put(bio); return ret; } -static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, +static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int skip_sum) { struct inode *inode = dip->inode; @@ -8298,8 +8391,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, int i; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, - &map_length, NULL, 0); + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + start_sector << 9, &map_length, NULL, 0); if (ret) return -EIO; @@ -8319,6 +8412,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8338,7 +8432,7 @@ next_block: * before we're done setting it up */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, rw, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (ret) { @@ -8356,12 +8450,13 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; + bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, rw, + ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -8381,7 +8476,7 @@ next_block: } submit: - ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, async_submit); if (!ret) return 0; @@ -8401,14 +8496,14 @@ out_err: return 0; } -static void btrfs_submit_direct(int rw, struct bio *dio_bio, - struct inode *inode, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { struct btrfs_dio_private *dip = NULL; struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; - int write = rw & REQ_WRITE; + bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -8459,7 +8554,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dio_data->unsubmitted_oe_range_end; } - ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + ret = btrfs_submit_direct_hook(dip, skip_sum); if (!ret) return; @@ -9019,7 +9114,7 @@ static int btrfs_truncate(struct inode *inode) return ret; /* - * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * Yes ladies and gentlemen, this is indeed ugly. The fact is we have * 3 things going on here * * 1) We need to reserve space for our orphan item and the space to @@ -9033,15 +9128,15 @@ static int btrfs_truncate(struct inode *inode) * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * - * And we need these to all be seperate. The fact is we can use alot of + * And we need these to all be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space - * we will use, so we need the truncate reservation to be seperate so it + * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode or * removing the orphan item. We also need to be able to stop the * transaction and start a new one, which means we need to be able to * update the inode several times, and we have no idea of knowing how * many times that will be, so we can't just reserve 1 item for the - * entirety of the opration, so that has to be done seperately as well. + * entirety of the operation, so that has to be done separately as well. * Then there is the orphan item, which does indeed need to be held on * to for the whole operation, and we need nobody to touch this reserved * space except the orphan code. @@ -9072,7 +9167,7 @@ static int btrfs_truncate(struct inode *inode) /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, - min_size); + min_size, 0); BUG_ON(ret); /* @@ -9112,7 +9207,7 @@ static int btrfs_truncate(struct inode *inode) } ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, - rsv, min_size); + rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; } @@ -9133,7 +9228,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); } - out: btrfs_free_block_rsv(root, rsv); @@ -9230,6 +9324,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } @@ -9341,25 +9436,25 @@ int btrfs_init_cachep(void) btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction", sizeof(struct btrfs_transaction), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); + SLAB_MEM_SPREAD, NULL); if (!btrfs_free_space_cachep) goto fail; @@ -9387,10 +9482,281 @@ static int btrfs_getattr(struct vfsmount *mnt, return 0; } +static int btrfs_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + struct dentry *parent; + u64 old_ino = btrfs_ino(old_inode); + u64 new_ino = btrfs_ino(new_inode); + u64 old_idx = 0; + u64 new_idx = 0; + u64 root_objectid; + int ret; + bool root_log_pinned = false; + bool dest_log_pinned = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&dest->fs_info->subvol_sem); + + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 12); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_notrans; + } + + /* + * We need to find a free sequence number both in the source and + * in the destination directory for the exchange. + */ + ret = btrfs_set_inode_index(new_dir, &old_idx); + if (ret) + goto out_fail; + ret = btrfs_set_inode_index(old_dir, &new_idx); + if (ret) + goto out_fail; + + BTRFS_I(old_inode)->dir_index = 0ULL; + BTRFS_I(new_inode)->dir_index = 0ULL; + + /* Reference for the source. */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(root->fs_info, trans); + } else { + btrfs_pin_log_trans(root); + root_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_ino, + btrfs_ino(new_dir), old_idx); + if (ret) + goto out_fail; + } + + /* And now for the dest. */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* force full log commit if subvolume involved. */ + btrfs_set_log_full_commit(dest->fs_info, trans); + } else { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + ret = btrfs_insert_inode_ref(trans, root, + old_dentry->d_name.name, + old_dentry->d_name.len, + new_ino, + btrfs_ino(old_dir), new_idx); + if (ret) + goto out_fail; + } + + /* Update inode version and ctime/mtime. */ + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); + inode_inc_iversion(old_inode); + inode_inc_iversion(new_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + new_inode->i_ctime = ctime; + + if (old_dentry->d_parent != new_dentry->d_parent) { + btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); + btrfs_record_unlink_dir(trans, new_dir, new_inode, 1); + } + + /* src is a subvolume */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, + root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { /* src is an inode */ + ret = __btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, root, old_inode); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + /* dest is a subvolume */ + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { + root_objectid = BTRFS_I(new_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + } else { /* dest is an inode */ + ret = __btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (!ret) + ret = btrfs_update_inode(trans, dest, new_inode); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, old_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + ret = btrfs_add_link(trans, old_dir, new_inode, + old_dentry->d_name.name, + old_dentry->d_name.len, 0, new_idx); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + + if (old_inode->i_nlink == 1) + BTRFS_I(old_inode)->dir_index = old_idx; + if (new_inode->i_nlink == 1) + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { + parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + parent = old_dentry->d_parent; + btrfs_log_new_name(trans, new_inode, new_dir, parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +out_fail: + /* + * If we have pinned a log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && (root_log_pinned || dest_log_pinned)) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + if (root_log_pinned) { + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } + } + ret = btrfs_end_transaction(trans, root); +out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&dest->fs_info->subvol_sem); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + return ret; +} + +static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + struct dentry *dentry) +{ + int ret; + struct inode *inode; + u64 objectid; + u64 index; + + ret = btrfs_find_free_ino(root, &objectid); + if (ret) + return ret; + + inode = btrfs_new_inode(trans, root, dir, + dentry->d_name.name, + dentry->d_name.len, + btrfs_ino(dir), + objectid, + S_IFCHR | WHITEOUT_MODE, + &index); + + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + return ret; + } + + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, + WHITEOUT_DEV); + + ret = btrfs_init_inode_security(trans, inode, dir, + &dentry->d_name); + if (ret) + goto out; + + ret = btrfs_add_nondir(trans, dir, dentry, + inode, 0, index); + if (ret) + goto out; + + ret = btrfs_update_inode(trans, root, inode); +out: + unlock_new_inode(inode); + if (ret) + inode_dec_link_count(inode); + iput(inode); + + return ret; +} + static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); @@ -9399,6 +9765,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, u64 root_objectid; int ret; u64 old_ino = btrfs_ino(old_inode); + bool log_pinned = false; if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9449,15 +9816,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * We want to reserve the absolute worst case amount of items. So if * both inodes are subvols and we need to unlink them then that would * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal + * would require 5 item modifications, so we'll assume they are normal * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items * should cover the worst case number of items we'll modify. + * If our rename has the whiteout flag, we need more 5 units for the + * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item + * when selinux is enabled). */ - trans = btrfs_start_transaction(root, 11); + trans_num_items = 11; + if (flags & RENAME_WHITEOUT) + trans_num_items += 5; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_notrans; - } + ret = PTR_ERR(trans); + goto out_notrans; + } if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -9471,6 +9844,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(root->fs_info, trans); } else { + btrfs_pin_log_trans(root); + log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9478,14 +9853,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_ino(new_dir), index); if (ret) goto out_fail; - /* - * this is an ugly little race, but the rename is required - * to make sure that if we crash, the inode is either at the - * old name or the new one. pinning the log transaction lets - * us make sure we don't allow a log commit to come in after - * we unlink the name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); } inode_inc_iversion(old_dir); @@ -9512,7 +9879,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = btrfs_update_inode(trans, root, old_inode); } if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } @@ -9536,7 +9903,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } } @@ -9545,19 +9912,53 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_dentry->d_name.name, new_dentry->d_name.len, 0, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (log_pinned) { struct dentry *parent = new_dentry->d_parent; + btrfs_log_new_name(trans, old_inode, old_dir, parent); btrfs_end_log_trans(root); + log_pinned = false; + } + + if (flags & RENAME_WHITEOUT) { + ret = btrfs_whiteout_for_rename(trans, root, old_dir, + old_dentry); + + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } out_fail: + /* + * If we have pinned the log and an error happened, we unpin tasks + * trying to sync the log and force them to fallback to a transaction + * commit if the log currently contains any of the inodes involved in + * this rename operation (to ensure we do not persist a log with an + * inconsistent state for any of these inodes or leading to any + * inconsistencies when replayed). If the transaction was aborted, the + * abortion reason is propagated to userspace when attempting to commit + * the transaction. If the log does not contain any of these inodes, we + * allow the tasks to sync it. + */ + if (ret && log_pinned) { + if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || + btrfs_inode_in_log(new_dir, root->fs_info->generation) || + btrfs_inode_in_log(old_inode, root->fs_info->generation) || + (new_inode && + btrfs_inode_in_log(new_inode, root->fs_info->generation))) + btrfs_set_log_full_commit(root->fs_info, trans); + + btrfs_end_log_trans(root); + log_pinned = false; + } btrfs_end_transaction(trans, root); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9570,10 +9971,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~RENAME_NOREPLACE) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); + if (flags & RENAME_EXCHANGE) + return btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + + return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } static void btrfs_run_delalloc_work(struct btrfs_work *work) @@ -9914,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; + u64 end = start + num_bytes - 1; if (trans) own_trans = false; @@ -9935,13 +10341,14 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); - ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, - *alloc_hint, &ins, 1, 0); + ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, + min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) btrfs_end_transaction(trans, root); break; } + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -9952,7 +10359,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10012,7 +10419,7 @@ next: ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans, root); break; @@ -10021,6 +10428,9 @@ next: if (own_trans) btrfs_end_transaction(trans, root); } + if (cur_offset < end) + btrfs_free_reserved_data_space(inode, cur_offset, + end - cur_offset + 1); return ret; } @@ -10160,10 +10570,10 @@ static const struct inode_operations btrfs_dir_inode_operations = { .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, @@ -10184,7 +10594,7 @@ static const struct file_operations btrfs_dir_file_operations = { .iterate_shared = btrfs_real_readdir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = btrfs_ioctl, + .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, @@ -10237,10 +10647,10 @@ static const struct address_space_operations btrfs_symlink_aops = { static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_acl = btrfs_get_acl, @@ -10251,10 +10661,10 @@ static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, @@ -10265,10 +10675,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, - .setxattr = btrfs_setxattr, + .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, - .removexattr = btrfs_removexattr, + .removexattr = generic_removexattr, .update_time = btrfs_update_time, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b8ba717175b..7fd939bfbd99 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags) if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; - if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) - iflags |= FS_COMPR_FL; - else if (flags & BTRFS_INODE_NOCOMPRESS) + if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; + else if (flags & BTRFS_INODE_COMPRESS) + iflags |= FS_COMPR_FL; return iflags; } @@ -296,7 +296,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } else { /* - * Revert back under same assuptions as above + * Revert back under same assumptions as above */ if (S_ISREG(mode)) { if (inode->i_size == 0) @@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_key key; - struct btrfs_root_item root_item; + struct btrfs_root_item *root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir, u64 qgroup_reserved; uuid_le new_uuid; + root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); + if (!root_item) + return -ENOMEM; + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) - return ret; + goto fail_free; /* * Don't create subvolume whose level is not zero. Or qgroup will be - * screwed up since it assume subvolme qgroup's level to be 0. + * screwed up since it assumes subvolume qgroup's level to be 0. */ - if (btrfs_qgroup_level(objectid)) - return -ENOSPC; + if (btrfs_qgroup_level(objectid)) { + ret = -ENOSPC; + goto fail_free; + } btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, &qgroup_reserved, false); if (ret) - return ret; + goto fail_free; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); - return ret; + goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; @@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); - memset(&root_item, 0, sizeof(root_item)); - - inode_item = &root_item.inode; + inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); - btrfs_set_root_flags(&root_item, 0); - btrfs_set_root_limit(&root_item, 0); + btrfs_set_root_flags(root_item, 0); + btrfs_set_root_limit(root_item, 0); btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); - btrfs_set_root_bytenr(&root_item, leaf->start); - btrfs_set_root_generation(&root_item, trans->transid); - btrfs_set_root_level(&root_item, 0); - btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, leaf->len); - btrfs_set_root_last_snapshot(&root_item, 0); + btrfs_set_root_bytenr(root_item, leaf->start); + btrfs_set_root_generation(root_item, trans->transid); + btrfs_set_root_level(root_item, 0); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_used(root_item, leaf->len); + btrfs_set_root_last_snapshot(root_item, 0); - btrfs_set_root_generation_v2(&root_item, - btrfs_root_generation(&root_item)); + btrfs_set_root_generation_v2(root_item, + btrfs_root_generation(root_item)); uuid_le_gen(&new_uuid); - memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); - btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); - btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); - root_item.ctime = root_item.otime; - btrfs_set_root_ctransid(&root_item, trans->transid); - btrfs_set_root_otransid(&root_item, trans->transid); + memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE); + btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec); + btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec); + root_item->ctime = root_item->otime; + btrfs_set_root_ctransid(root_item, trans->transid); + btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); leaf = NULL; - btrfs_set_root_dirid(&root_item, new_dirid); + btrfs_set_root_dirid(root_item, new_dirid); key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - &root_item); + root_item); if (ret) goto fail; @@ -557,7 +561,7 @@ static noinline int create_subvol(struct inode *dir, new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -566,7 +570,7 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid); if (ret) { /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -579,7 +583,7 @@ static noinline int create_subvol(struct inode *dir, */ ret = btrfs_set_inode_index(dir, &index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -587,7 +591,7 @@ static noinline int create_subvol(struct inode *dir, name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir, BUG_ON(ret); ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); fail: + kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); @@ -629,6 +634,10 @@ fail: d_instantiate(dentry, inode); } return ret; + +fail_free: + kfree(root_item); + return ret; } static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) @@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto dec_and_free; - btrfs_wait_ordered_extents(root, -1); + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); @@ -771,7 +780,7 @@ free_pending: * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability - * 6. If the victim is append-only or immutable we can't do antyhing with + * 6. If the victim is append-only or immutable we can't do anything with * links pointing to it. * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. @@ -837,11 +846,9 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (error == -EINTR) - // return error; + error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (error == -EINTR) + return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -1230,7 +1237,7 @@ again: set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state, GFP_NOFS); + &cached_state); unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, &cached_state, @@ -1627,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, int namelen; int ret = 0; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + ret = mnt_want_write_file(file); if (ret) goto out; @@ -1684,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, struct btrfs_ioctl_vol_args *vol_args; int ret; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1707,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, bool readonly = false; struct btrfs_qgroup_inherit *inherit = NULL; + if (!S_ISDIR(file_inode(file)->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1941,8 +1957,7 @@ static noinline int key_in_sk(struct btrfs_key *key, return 1; } -static noinline int copy_to_sk(struct btrfs_root *root, - struct btrfs_path *path, +static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, size_t *buf_size, @@ -2113,7 +2128,7 @@ static noinline int search_ioctl(struct inode *inode, ret = 0; goto err; } - ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf, + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); if (ret) @@ -2351,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; + if (!S_ISDIR(dir->i_mode)) + return -ENOTDIR; + vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -2368,11 +2386,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - inode_lock_nested(dir, I_MUTEX_PARENT); - // XXX: should've been - // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - // if (err == -EINTR) - // goto out_drop_write; + err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); + if (err == -EINTR) + goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2401,7 +2417,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). */ err = -EPERM; - if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2484,7 +2500,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry->d_name.len); if (ret) { err = ret; - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_end_trans; } @@ -2500,7 +2516,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, root->fs_info->tree_root, dest->root_key.objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2510,7 +2526,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2520,7 +2536,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); err = ret; goto out_end_trans; } @@ -2562,7 +2578,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -//out_drop_write: +out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); @@ -2671,10 +2687,10 @@ out: return ret; } -static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -2690,7 +2706,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) goto err_drop; } - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + /* Check for compatibility reject unknown flags */ + if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) + return -EOPNOTSUPP; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { @@ -2699,13 +2717,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name); + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + ret = btrfs_rm_device(root, NULL, vol_args->devid); + } else { + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name, 0); + } mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); - if (!ret) - btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); - + if (!ret) { + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) + btrfs_info(root->fs_info, "device deleted: id %llu", + vol_args->devid); + else + btrfs_info(root->fs_info, "device deleted: %s", + vol_args->name); + } out: kfree(vol_args); err_drop: @@ -2713,6 +2741,47 @@ err_drop: return ret; } +static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) +{ + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, + 1)) { + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_drop_write; + } + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) { + ret = PTR_ERR(vol_args); + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + mutex_lock(&root->fs_info->volume_mutex); + ret = btrfs_rm_device(root, vol_args->name, 0); + mutex_unlock(&root->fs_info->volume_mutex); + + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + kfree(vol_args); +out: + atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); +out_drop_write: + mnt_drop_write_file(file); + + return ret; +} + static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -3234,7 +3303,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3472,13 +3541,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(root->nodesize); - if (!buf) - return ret; + buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + if (!buf) { + buf = vmalloc(root->nodesize); + if (!buf) + return ret; + } path = btrfs_alloc_path(); if (!path) { - vfree(buf); + kvfree(buf); return ret; } @@ -3633,7 +3705,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); + ret); btrfs_end_transaction(trans, root); goto out; } @@ -3641,8 +3713,7 @@ process_slot: ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) { - btrfs_abort_transaction(trans, root, - ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3674,7 +3745,6 @@ process_slot: new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); @@ -3711,7 +3781,6 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); btrfs_end_transaction(trans, root); goto out; @@ -3767,7 +3836,7 @@ process_slot: last_dest_end, destoff + len, 1); if (ret) { if (ret != -EOPNOTSUPP) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans, root); goto out; } @@ -3779,7 +3848,7 @@ process_slot: out: btrfs_free_path(path); - vfree(buf); + kvfree(buf); return ret; } @@ -4380,7 +4449,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_start(root, p); + ret = btrfs_dev_replace_by_ioctl(root, p); atomic_set( &root->fs_info->mutually_exclusive_operation_running, 0); @@ -4589,7 +4658,7 @@ again: } /* - * mut. excl. ops lock is locked. Three possibilites: + * mut. excl. ops lock is locked. Three possibilities: * (1) some other op is running * (2) balance is running * (3) balance is paused -- special case (think resume) @@ -4851,8 +4920,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_std_error(root->fs_info, ret, - "failed to update qgroup status and info\n"); + btrfs_handle_fs_error(root->fs_info, err, + "failed to update qgroup status and info"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -5027,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info); + return btrfs_qgroup_wait_for_completion(root->fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, @@ -5103,13 +5172,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_commit_transaction(trans, root); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -5398,9 +5467,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) if (ret) return ret; + ret = mnt_want_write_file(file); + if (ret) + return ret; + trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_drop_write; + } spin_lock(&root->fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); @@ -5419,7 +5494,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) btrfs_set_super_incompat_flags(super_block, newflags); spin_unlock(&root->fs_info->super_lock); - return btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); +out_drop_write: + mnt_drop_write_file(file); + + return ret; } long btrfs_ioctl(struct file *file, unsigned int @@ -5463,6 +5542,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); + case BTRFS_IOC_RM_DEV_V2: + return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: return btrfs_ioctl_fs_info(root, argp); case BTRFS_IOC_DEV_INFO: @@ -5494,7 +5575,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, - * namely it pokes the cleaner ktread that will start + * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. */ wake_up_process(root->fs_info->transaction_kthread); @@ -5556,3 +5637,24 @@ long btrfs_ioctl(struct file *file, unsigned int return -ENOTTY; } + +#ifdef CONFIG_COMPAT +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + case FS_IOC32_GETVERSION: + cmd = FS_IOC_GETVERSION; + break; + default: + return -ENOIOCTLCMD; + } + + return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 0de7da5a610d..3b78d38173b3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len) { - struct list_head splice, works; + LIST_HEAD(splice); + LIST_HEAD(skipped); + LIST_HEAD(works); struct btrfs_ordered_extent *ordered, *next; int count = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); + const u64 range_end = range_start + range_len; mutex_lock(&root->ordered_extent_mutex); spin_lock(&root->ordered_extent_lock); @@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) while (!list_empty(&splice) && nr) { ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); + + if (range_end <= ordered->start || + ordered->start + ordered->disk_len <= range_start) { + list_move_tail(&ordered->root_extent_list, &skipped); + cond_resched_lock(&root->ordered_extent_lock); + continue; + } + list_move_tail(&ordered->root_extent_list, &root->ordered_extents); atomic_inc(&ordered->refs); @@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) nr--; count++; } + list_splice_tail(&skipped, &root->ordered_extents); list_splice_tail(&splice, &root->ordered_extents); spin_unlock(&root->ordered_extent_lock); @@ -708,11 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -728,8 +740,10 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); - done = btrfs_wait_ordered_extents(root, nr); + done = btrfs_wait_ordered_extents(root, nr, + range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -740,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* @@ -952,6 +968,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; int ret = 1; + u64 orig_offset = offset; spin_lock_irq(&tree->lock); if (ordered) { @@ -967,7 +984,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, /* truncate file */ if (disk_i_size > i_size) { - BTRFS_I(inode)->disk_i_size = i_size; + BTRFS_I(inode)->disk_i_size = orig_offset; ret = 0; goto out; } @@ -1105,7 +1122,7 @@ int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", sizeof(struct btrfs_ordered_extent), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD, NULL); if (!btrfs_ordered_extent_cache) return -ENOMEM; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 23c96059cef2..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -58,7 +58,7 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */ -#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to preallocated extent */ #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ @@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, + const u64 range_start, const u64 range_len); +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, + const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, const loff_t start, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 36992128c746..cf0b444ac4f3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_root *parent_root) { + struct super_block *sb = root->fs_info->sb; struct btrfs_key key; struct inode *parent_inode, *child_inode; int ret; @@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - parent_inode = btrfs_iget(parent_root->fs_info->sb, &key, - parent_root, NULL); + parent_inode = btrfs_iget(sb, &key, parent_root, NULL); if (IS_ERR(parent_inode)) return PTR_ERR(parent_inode); - child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL); + child_inode = btrfs_iget(sb, &key, root, NULL); if (IS_ERR(child_inode)) { iput(parent_inode); return PTR_ERR(child_inode); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e119552ed32..8db2e29fdcf4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -85,7 +85,7 @@ struct btrfs_qgroup { /* * temp variables for accounting operations - * Refer to qgroup_shared_accouting() for details. + * Refer to qgroup_shared_accounting() for details. */ u64 old_refcnt; u64 new_refcnt; @@ -499,7 +499,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) } /* * we call btrfs_free_qgroup_config() when umounting - * filesystem and disabling quota, so we set qgroup_ulit + * filesystem and disabling quota, so we set qgroup_ulist * to be null here to avoid double free. */ ulist_free(fs_info->qgroup_ulist); @@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; - if (btrfs_test_is_dummy_root(quota_root)) + if (btrfs_is_testing(quota_root->fs_info)) return 0; path = btrfs_alloc_path(); @@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; - if (btrfs_test_is_dummy_root(root)) + if (btrfs_is_testing(root->fs_info)) return 0; key.objectid = 0; @@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, goto out; fs_info->quota_enabled = 0; fs_info->pending_quota_state = 0; - btrfs_qgroup_wait_for_completion(fs_info); + btrfs_qgroup_wait_for_completion(fs_info, false); spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; fs_info->quota_root = NULL; @@ -1036,7 +1036,7 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, /* * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and + * then this qgroup and all of the parent qgroups get their reference and * exclusive counts adjusted. * * Caller should hold fs_info->qgroup_lock. @@ -1436,7 +1436,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, /* * No need to do lock, since this function will only be called in - * btrfs_commmit_transaction(). + * btrfs_commit_transaction(). */ node = rb_first(&delayed_refs->dirty_extent_root); while (node) { @@ -1453,9 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, return ret; } -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record) +int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; struct rb_node *parent_node = NULL; @@ -1463,7 +1463,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); - trace_btrfs_qgroup_insert_dirty_extent(record); + trace_btrfs_qgroup_insert_dirty_extent(fs_info, record); while (*p) { parent_node = *p; @@ -1474,12 +1474,42 @@ struct btrfs_qgroup_extent_record else if (bytenr > entry->bytenr) p = &(*p)->rb_right; else - return entry; + return 1; } rb_link_node(&record->node, parent_node, p); rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); - return NULL; + return 0; +} + +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag) +{ + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0) + return 0; + if (WARN_ON(trans == NULL)) + return -EINVAL; + record = kmalloc(sizeof(*record), gfp_flag); + if (!record) + return -ENOMEM; + + delayed_refs = &trans->transaction->delayed_refs; + record->bytenr = bytenr; + record->num_bytes = num_bytes; + record->old_roots = NULL; + + spin_lock(&delayed_refs->lock); + ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs, + record); + spin_unlock(&delayed_refs->lock); + if (ret > 0) + kfree(record); + return 0; } #define UPDATE_NEW 0 @@ -1557,7 +1587,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * A: cur_old_roots < nr_old_roots (not exclusive before) * !A: cur_old_roots == nr_old_roots (possible exclusive before) * B: cur_new_roots < nr_new_roots (not exclusive now) - * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * !B: cur_new_roots == nr_new_roots (possible exclusive now) * * Results: * +: Possible sharing -> exclusive -: Possible exclusive -> sharing @@ -1595,8 +1625,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(qg->qgroupid, cur_old_count, - cur_new_count); + trace_qgroup_update_counters(fs_info, qg->qgroupid, + cur_old_count, cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -1687,8 +1717,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); - trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, - nr_new_roots); + trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes, + nr_old_roots, nr_new_roots); qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { @@ -1759,7 +1789,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); - trace_btrfs_qgroup_account_extents(record); + trace_btrfs_qgroup_account_extents(fs_info, record); if (!ret) { /* @@ -1851,7 +1881,7 @@ out: } /* - * Copy the acounting information between qgroups. This is necessary + * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will * cause a transaction abort so we take extra care here to only error * when a readonly fs is a reasonable outcome. @@ -2195,7 +2225,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) return; - btrfs_err(trans->root->fs_info, + btrfs_err(trans->fs_info, "qgroups not uptodate in trans handle %p: list is%s empty, " "seq is %#x.%x", trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", @@ -2302,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = true; + mutex_unlock(&fs_info->qgroup_rescan_lock); + path = btrfs_alloc_path(); if (!path) goto out; @@ -2340,7 +2374,7 @@ out: mutex_unlock(&fs_info->qgroup_rescan_lock); /* - * only update status, since the previous part has alreay updated the + * only update status, since the previous part has already updated the * qgroup info. */ trans = btrfs_start_transaction(fs_info->quota_root, 1); @@ -2368,6 +2402,9 @@ out: } done: + mutex_lock(&fs_info->qgroup_rescan_lock); + fs_info->qgroup_rescan_running = false; + mutex_unlock(&fs_info->qgroup_rescan_lock); complete_all(&fs_info->qgroup_rescan_completion); } @@ -2486,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info) +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible) { int running; int ret = 0; mutex_lock(&fs_info->qgroup_rescan_lock); spin_lock(&fs_info->qgroup_lock); - running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN; + running = fs_info->qgroup_rescan_running; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); - if (running) + if (!running) + return 0; + + if (interruptible) ret = wait_for_completion_interruptible( &fs_info->qgroup_rescan_completion); + else + wait_for_completion(&fs_info->qgroup_rescan_completion); return ret; } @@ -2542,8 +2585,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) changeset.bytes_changed = 0; changeset.range_changed = ulist_alloc(GFP_NOFS); ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); trace_btrfs_qgroup_reserve_data(inode, start, len, changeset.bytes_changed, QGROUP_RESERVE); @@ -2580,8 +2622,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, return -ENOMEM; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, - &changeset); + start + len -1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -2672,7 +2713,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) } /* - * Check qgroup reserved space leaking, normally at destory inode + * Check qgroup reserved space leaking, normally at destroy inode * time */ void btrfs_qgroup_check_reserved_leak(struct inode *inode) @@ -2688,7 +2729,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode) return; ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index ecb2c143ef75..1bc64c864b62 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); -int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info); +int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, + bool interruptible); int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, @@ -63,9 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -struct btrfs_qgroup_extent_record -*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_qgroup_extent_record *record); +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * No lock version, caller must acquire delayed ref lock and allocate memory. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ +int btrfs_qgroup_insert_dirty_extent_nolock( + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); + +/* + * Insert one dirty extent record into @delayed_refs, informing qgroup to + * account that extent at commit trans time. + * + * Better encapsulated version. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ +int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, + gfp_t gfp_flag); + int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -88,7 +115,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); - trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); } void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0b7792e02dd5..cd8d302a1f61 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -576,7 +576,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can - * steal from cached rbio's though, other functions + * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || @@ -1320,7 +1320,9 @@ write_data: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -1573,11 +1575,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_rmw_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return 0; @@ -2097,11 +2100,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid_recover_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } out: return 0; @@ -2368,7 +2372,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - /* Check scrubbing pairty and repair it */ + /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); parity = kmap(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) @@ -2433,7 +2437,9 @@ submit_write: bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - submit_bio(WRITE, bio); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + submit_bio(bio); } return; @@ -2493,7 +2499,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckly, use the other one to repair + * is scrubbing parity, luckily, use the other one to repair * the data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) @@ -2610,11 +2616,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_private = rbio; bio->bi_end_io = raid56_parity_scrub_end_io; + bio_set_op_attrs(bio, REQ_OP_READ, 0); btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); - submit_bio(READ, bio); + submit_bio(bio); } /* the actual write will happen once the reads are done */ return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 298631eaee78..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 08ef890deca6..c0c13dc6fe12 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "async-thread.h" #include "free-space-cache.h" #include "inode-map.h" +#include "qgroup.h" /* * backref_node, mapping_node and tree_block start with this @@ -235,12 +236,12 @@ static void backref_cache_cleanup(struct backref_cache *cache) cache->last_trans = 0; for (i = 0; i < BTRFS_MAX_LEVEL; i++) - BUG_ON(!list_empty(&cache->pending[i])); - BUG_ON(!list_empty(&cache->changed)); - BUG_ON(!list_empty(&cache->detached)); - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); - BUG_ON(cache->nr_nodes); - BUG_ON(cache->nr_edges); + ASSERT(list_empty(&cache->pending[i])); + ASSERT(list_empty(&cache->changed)); + ASSERT(list_empty(&cache->detached)); + ASSERT(RB_EMPTY_ROOT(&cache->rb_root)); + ASSERT(!cache->nr_nodes); + ASSERT(!cache->nr_edges); } static struct backref_node *alloc_backref_node(struct backref_cache *cache) @@ -668,8 +669,8 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * roots of b-trees that reference the tree block. * * the basic idea of this function is check backrefs of a given block - * to find upper level blocks that refernece the block, and then check - * bakcrefs of these upper level blocks recursively. the recursion stop + * to find upper level blocks that reference the block, and then check + * backrefs of these upper level blocks recursively. the recursion stop * when tree root is reached or backrefs for the block is cached. * * NOTE: if we find backrefs for a block are cached, we know backrefs @@ -1160,7 +1161,7 @@ out: if (!RB_EMPTY_NODE(&upper->rb_node)) continue; - /* Add this guy's upper edges to the list to proces */ + /* Add this guy's upper edges to the list to process */ list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); if (list_empty(&upper->upper)) @@ -1171,8 +1172,12 @@ out: lower = list_entry(useless.next, struct backref_node, list); list_del_init(&lower->list); + if (lower == node) + node = NULL; free_backref_node(cache, lower); } + + free_backref_node(cache, node); return ERR_PTR(err); } ASSERT(!node || !node->detached); @@ -1719,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } @@ -1727,7 +1732,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); break; } } @@ -2396,7 +2401,7 @@ again: } /* - * we keep the old last snapshod transid in rtranid when we + * we keep the old last snapshot transid in rtranid when we * created the relocation tree. */ last_snap = btrfs_root_rtransid(&reloc_root->root_item); @@ -2418,7 +2423,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2604,25 +2609,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, trans->block_rsv = rc->block_rsv; rc->reserved_bytes += num_bytes; + + /* + * We are under a transaction here so we can only do limited flushing. + * If we get an enospc just kick back -EAGAIN so we know to drop the + * transaction and try to refill when we can flush all the things. + */ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, - BTRFS_RESERVE_FLUSH_ALL); + BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - if (ret == -EAGAIN) { - tmp = rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - while (tmp <= rc->reserved_bytes) - tmp <<= 1; - /* - * only one thread can access block_rsv at this point, - * so we don't need hold lock to protect block_rsv. - * we expand more reservation size here to allow enough - * space for relocation and we will return eailer in - * enospc case. - */ - rc->block_rsv->size = tmp + rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; - } - return ret; + tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + while (tmp <= rc->reserved_bytes) + tmp <<= 1; + /* + * only one thread can access block_rsv at this point, + * so we don't need hold lock to protect block_rsv. + * we expand more reservation size here to allow enough + * space for relocation and we will return eailer in + * enospc case. + */ + rc->block_rsv->size = tmp + rc->extent_root->nodesize * + RELOCATION_RESERVED_NODES; + return -EAGAIN; } return 0; @@ -2814,7 +2822,7 @@ static void mark_block_processed(struct reloc_control *rc, u64 bytenr, u32 blocksize) { set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, - EXTENT_DIRTY, GFP_NOFS); + EXTENT_DIRTY); } static void __mark_block_processed(struct reloc_control *rc, @@ -3030,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode, u64 num_bytes; int nr = 0; int ret = 0; + u64 prealloc_start = cluster->start - offset; + u64 prealloc_end = cluster->end - offset; + u64 cur_offset; BUG_ON(cluster->start != cluster->boundary[0]); inode_lock(inode); - ret = btrfs_check_data_free_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + ret = btrfs_check_data_free_space(inode, prealloc_start, + prealloc_end + 1 - prealloc_start); if (ret) goto out; + cur_offset = prealloc_start; while (nr < cluster->nr) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -3048,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode, lock_extent(&BTRFS_I(inode)->io_tree, start, end); num_bytes = end + 1 - start; + if (cur_offset < start) + btrfs_free_reserved_data_space(inode, cur_offset, + start - cur_offset); ret = btrfs_prealloc_file_range(inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); + cur_offset = end + 1; unlock_extent(&BTRFS_I(inode)->io_tree, start, end); if (ret) break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->start, - cluster->end + 1 - cluster->start); + if (cur_offset < prealloc_end) + btrfs_free_reserved_data_space(inode, cur_offset, + prealloc_end + 1 - cur_offset); out: inode_unlock(inode); return ret; @@ -3182,7 +3199,7 @@ static int relocate_file_extent_cluster(struct inode *inode, page_start + offset == cluster->boundary[nr]) { set_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_BOUNDARY, GFP_NOFS); + EXTENT_BOUNDARY); nr++; } @@ -3871,6 +3888,7 @@ static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { struct btrfs_trans_handle *trans; + int ret; rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, BTRFS_BLOCK_RSV_TEMP); @@ -3885,6 +3903,11 @@ int prepare_to_relocate(struct reloc_control *rc) rc->reserved_bytes = 0; rc->block_rsv->size = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + ret = btrfs_block_rsv_refill(rc->extent_root, + rc->block_rsv, rc->block_rsv->size, + BTRFS_RESERVE_FLUSH_ALL); + if (ret) + return ret; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3903,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc) return 0; } +/* + * Qgroup fixer for data chunk relocation. + * The data relocation is done in the following steps + * 1) Copy data extents into data reloc tree + * 2) Create tree reloc tree(special snapshot) for related subvolumes + * 3) Modify file extents in tree reloc tree + * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks + * + * The problem is, data and tree reloc tree are not accounted to qgroup, + * and 4) will only info qgroup to track tree blocks change, not file extents + * in the tree blocks. + * + * The good news is, related data extents are all in data reloc tree, so we + * only need to info qgroup to track all file extents in data reloc tree + * before commit trans. + */ +static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + struct inode *inode = rc->data_inode; + struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + if (!fs_info->quota_enabled) + return 0; + + /* + * Only for stage where we update data pointers the qgroup fix is + * valid. + * For MOVING_DATA stage, we will miss the timing of swapping tree + * blocks, and won't fix it. + */ + if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); + while (1) { + struct btrfs_file_extent_item *fi; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid > btrfs_ino(inode)) + break; + if (key.type != BTRFS_EXTENT_DATA_KEY) + goto next; + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(path->nodes[0], fi) != + BTRFS_FILE_EXTENT_REG) + goto next; + ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, + btrfs_file_extent_disk_bytenr(path->nodes[0], fi), + btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), + GFP_NOFS); + if (ret < 0) + break; +next: + ret = btrfs_next_item(data_reloc_root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); +out: + btrfs_free_path(path); + return ret; +} + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; @@ -4059,8 +4166,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans, rc->extent_root); @@ -4090,10 +4196,18 @@ restart: /* get rid of pinned extents */ trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + ret = qgroup_fix_relocated_data_extents(trans, rc); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + goto out_free; + } + btrfs_commit_transaction(trans, rc->extent_root); out_free: btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); btrfs_free_path(path); @@ -4254,12 +4368,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", rc->block_group->key.objectid, rc->block_group->flags); - ret = btrfs_start_delalloc_roots(fs_info, 0, -1); - if (ret < 0) { - err = ret; - goto out; - } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_block_group_reservations(rc->block_group); + btrfs_wait_nocow_writers(rc->block_group); + btrfs_wait_ordered_roots(fs_info, -1, + rc->block_group->key.objectid, + rc->block_group->key.offset); while (1) { mutex_lock(&fs_info->cleaner_mutex); @@ -4457,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { err = PTR_ERR(trans); - else - err = btrfs_commit_transaction(trans, rc->extent_root); + goto out_free; + } + err = qgroup_fix_relocated_data_extents(trans, rc); + if (err < 0) { + btrfs_abort_transaction(trans, err); + goto out_free; + } + err = btrfs_commit_transaction(trans, rc->extent_root); out_free: kfree(rc); out: @@ -4592,7 +4711,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, /* * called before creating snapshot. it calculates metadata reservation - * requried for relocating tree blocks in the snapshot + * required for relocating tree blocks in the snapshot */ void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve) @@ -4645,7 +4764,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, if (rc->merge_reloc_tree) { ret = btrfs_block_rsv_migrate(&pending->block_rsv, rc->block_rsv, - rc->nodes_relocated); + rc->nodes_relocated, 1); if (ret) return ret; } diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 9fcd6dfc3266..091296062456 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -71,9 +71,9 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, * search_key: the key to search * path: the path we search * root_item: the root item of the tree we look for - * root_key: the reak key of the tree we look for + * root_key: the root key of the tree we look for * - * If ->offset of 'seach_key' is -1ULL, it means we are not sure the offset + * If ->offset of 'search_key' is -1ULL, it means we are not sure the offset * of the search key, just lookup the root with the highest offset for a * given objectid. * @@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, 0, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root ret = btrfs_search_slot(trans, root, key, path, -1, 1); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); ret = btrfs_insert_empty_item(trans, root, path, key, sizeof(*item)); if (ret < 0) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } l = path->nodes[0]; @@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid = key.offset; key.offset++; + /* + * The root might have been inserted already, as before we look + * for orphan roots, log replay might have happened, which + * triggers a transaction commit and qgroup accounting, which + * in turn reads and inserts fs roots while doing backref + * walking. + */ + root = btrfs_lookup_fs_root(tree_root->fs_info, + root_key.objectid); + if (root) { + WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, + &root->state)); + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); + continue; + } + root = btrfs_read_fs_root(tree_root, &root_key); err = PTR_ERR_OR_ZERO(root); if (err && err != -ENOENT) { @@ -284,7 +301,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -293,7 +310,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_std_error(tree_root->fs_info, err, + btrfs_handle_fs_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; @@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); err = btrfs_insert_fs_root(root->fs_info, root); - /* - * The root might have been inserted already, as before we look - * for orphan roots, log replay might have happened, which - * triggers a transaction commit and qgroup accounting, which - * in turn reads and inserts fs roots while doing backref - * walking. - */ - if (err == -EEXIST) - err = 0; if (err) { + BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); break; } @@ -448,7 +457,7 @@ again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); if (ret) { - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4678f03e878e..1d195d2b32c6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -745,7 +745,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) * sure we read the bad mirror. */ ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); if (ret) { /* set_extent_bits should give proper error */ WARN_ON(ret > 0); @@ -763,7 +763,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) end, EXTENT_DAMAGED, 0, NULL); if (!corrected) clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, - EXTENT_DAMAGED, GFP_NOFS); + EXTENT_DAMAGED); } out: @@ -1044,7 +1044,7 @@ nodatasum_case: /* * !is_metadata and !have_csum, this means that the data - * might not be COW'ed, that it might be modified + * might not be COWed, that it might be modified * concurrently. The general strategy to work on the * commit root does not help in the case when COW is not * used. @@ -1125,7 +1125,7 @@ nodatasum_case: * the 2nd page of mirror #1 faces I/O errors, and the 2nd page * of mirror #2 is readable but the final checksum test fails, * then the 2nd page of mirror #3 could be tried, whether now - * the final checksum succeedes. But this would be a rare + * the final checksum succeeds. But this would be a rare * exception and is therefore not implemented. At least it is * avoided that the good copy is overwritten. * A more useful improvement would be to pick the sectors @@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bbio = bbio; recover->map_length = mapped_length; - BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); + BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); @@ -1504,8 +1504,9 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, sblock->no_io_error_seen = 0; } else { bio->bi_iter.bi_sector = page->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(READ, bio)) + if (btrfsic_submit_bio_wait(bio)) sblock->no_io_error_seen = 0; } @@ -1583,6 +1584,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; bio->bi_bdev = page_bad->dev->bdev; bio->bi_iter.bi_sector = page_bad->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); if (PAGE_SIZE != ret) { @@ -1590,7 +1592,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - if (btrfsic_submit_bio_wait(WRITE, bio)) { + if (btrfsic_submit_bio_wait(bio)) { btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( @@ -1684,6 +1686,7 @@ again: bio->bi_end_io = scrub_wr_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical_for_dev_replace || @@ -1731,7 +1734,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(WRITE, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static void scrub_wr_bio_end_io(struct bio *bio) @@ -2041,7 +2044,7 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(READ, sbio->bio); + btrfsic_submit_bio(sbio->bio); } static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, @@ -2088,6 +2091,7 @@ again: bio->bi_end_io = scrub_bio_end_io; bio->bi_bdev = sbio->dev->bdev; bio->bi_iter.bi_sector = sbio->physical >> 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->err = 0; } else if (sbio->physical + sbio->page_count * PAGE_SIZE != spage->physical || @@ -2127,6 +2131,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio) if (bio->bi_error) sblock->no_io_error_seen = 0; + bio_put(bio); + btrfs_queue_work(fs_info->scrub_workers, &sblock->work); } @@ -2179,7 +2185,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio; + struct btrfs_bio *bbio = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2860,7 +2866,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = map->stripe_len / root->sectorsize; + nsectors = div_u64(map->stripe_len, root->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -2980,6 +2986,7 @@ again: extent_len); mapped_length = extent_len; + bbio = NULL; ret = btrfs_map_block(fs_info, READ, extent_logical, &mapped_length, &bbio, 0); if (!ret) { @@ -3070,7 +3077,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int slot; u64 nstripes; struct extent_buffer *l; - struct btrfs_key key; u64 physical; u64 logical; u64 logic_end; @@ -3079,7 +3085,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int mirror_num; struct reada_control *reada1; struct reada_control *reada2; - struct btrfs_key key_start; + struct btrfs_key key; struct btrfs_key key_end; u64 increment = map->stripe_len; u64 offset; @@ -3158,21 +3164,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, scrub_blocked_if_needed(fs_info); /* FIXME it might be better to start readahead at commit root */ - key_start.objectid = logical; - key_start.type = BTRFS_EXTENT_ITEM_KEY; - key_start.offset = (u64)0; + key.objectid = logical; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)0; key_end.objectid = logic_end; key_end.type = BTRFS_METADATA_ITEM_KEY; key_end.offset = (u64)-1; - reada1 = btrfs_reada_add(root, &key_start, &key_end); + reada1 = btrfs_reada_add(root, &key, &key_end); - key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_start.type = BTRFS_EXTENT_CSUM_KEY; - key_start.offset = logical; + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; key_end.type = BTRFS_EXTENT_CSUM_KEY; key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); + reada2 = btrfs_reada_add(csum_root, &key, &key_end); if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); @@ -3580,6 +3586,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. + */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { @@ -3600,9 +3646,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3638,6 +3686,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3675,9 +3728,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; skip: key.offset = found_key.offset + length; btrfs_release_path(path); @@ -3735,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("scrub", flags, + btrfs_alloc_workqueue(fs_info, "scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("scrubwrc", flags, + btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("scrubnc", flags, 1, 0); + btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("scrubparity", flags, + btrfs_alloc_workqueue(fs_info, "scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -3810,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ - btrfs_err(fs_info, + btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE " "(%d != %lu) fails", fs_info->chunk_root->sectorsize, PAGE_SIZE); @@ -4390,6 +4440,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: @@ -4398,7 +4449,7 @@ leave_with_eio: return -EIO; } - if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) + if (btrfsic_submit_bio_wait(bio)) goto leave_with_eio; bio_put(bio); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8d358c547c59..a87675ffd02b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -231,7 +231,6 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; - bool is_orphan; struct list_head update_refs; }; @@ -274,6 +273,39 @@ struct name_cache_entry { char name[]; }; +static void inconsistent_snapshot_error(struct send_ctx *sctx, + enum btrfs_compare_tree_result result, + const char *what) +{ + const char *result_string; + + switch (result) { + case BTRFS_COMPARE_TREE_NEW: + result_string = "new"; + break; + case BTRFS_COMPARE_TREE_DELETED: + result_string = "deleted"; + break; + case BTRFS_COMPARE_TREE_CHANGED: + result_string = "updated"; + break; + case BTRFS_COMPARE_TREE_SAME: + ASSERT(0); + result_string = "unchanged"; + break; + default: + ASSERT(0); + result_string = "unexpected"; + } + + btrfs_err(sctx->send_root->fs_info, + "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu", + result_string, what, sctx->cmp_key->objectid, + sctx->send_root->root_key.objectid, + (sctx->parent_root ? + sctx->parent_root->root_key.objectid : 0)); +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -1831,7 +1863,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, /* * If we have a parent root we need to verify that the parent dir was - * not delted and then re-created, if it was then we have no overwrite + * not deleted and then re-created, if it was then we have no overwrite * and we can just unlink this entry. */ if (sctx->parent_root) { @@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, * was already unlinked/moved, so we can safely assume that we will not * overwrite anything at this point in time. */ - if (other_inode > sctx->send_progress) { + if (other_inode > sctx->send_progress || + is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, who_gen, NULL, NULL, NULL, NULL); if (ret < 0) @@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); + if (ret > 0) + ret = -ENOENT; if (ret < 0) goto out; @@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, } if (loc.objectid > send_progress) { + struct orphan_dir_info *odi; + + odi = get_orphan_dir_info(sctx, dir); + free_orphan_dir_info(sctx, odi); ret = 0; goto out; } @@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; - pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } +static int path_loop(struct send_ctx *sctx, struct fs_path *name, + u64 ino, u64 gen, u64 *ancestor_ino) +{ + int ret = 0; + u64 parent_inode = 0; + u64 parent_gen = 0; + u64 start_ino = ino; + + *ancestor_ino = 0; + while (ino != BTRFS_FIRST_FREE_OBJECTID) { + fs_path_reset(name); + + if (is_waiting_for_rm(sctx, ino)) + break; + if (is_waiting_for_move(sctx, ino)) { + if (*ancestor_ino == 0) + *ancestor_ino = ino; + ret = get_first_ref(sctx->parent_root, ino, + &parent_inode, &parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + &parent_inode, + &parent_gen, name); + if (ret > 0) { + ret = 0; + break; + } + } + if (ret < 0) + break; + if (parent_inode == start_ino) { + ret = 1; + if (*ancestor_ino == 0) + *ancestor_ino = ino; + break; + } + ino = parent_inode; + gen = parent_gen; + } + return ret; +} + static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) u64 parent_ino, parent_gen; struct waiting_dir_move *dm = NULL; u64 rmdir_ino = 0; + u64 ancestor; + bool is_orphan; int ret; name = fs_path_alloc(); @@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) dm = get_waiting_dir_move(sctx, pm->ino); ASSERT(dm); rmdir_ino = dm->rmdir_ino; + is_orphan = dm->orphanized; free_waiting_dir_move(sctx, dm); - if (pm->is_orphan) { + if (is_orphan) { ret = gen_unique_name(sctx, pm->ino, pm->gen, from_path); } else { @@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; + ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); + if (ret < 0) + goto out; + if (ret) { + LIST_HEAD(deleted_refs); + ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); + ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, + &pm->update_refs, &deleted_refs, + is_orphan); + if (ret < 0) + goto out; + if (rmdir_ino) { + dm = get_waiting_dir_move(sctx, pm->ino); + ASSERT(dm); + dm->rmdir_ino = rmdir_ino; + } + goto out; + } fs_path_reset(name); to_path = name; name = NULL; @@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) /* already deleted */ goto finish; } - ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1); + ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino); if (ret < 0) goto out; if (!ret) @@ -3204,8 +3305,18 @@ finish: * and old parent(s). */ list_for_each_entry(cur, &pm->update_refs, list) { - if (cur->dir == rmdir_ino) + /* + * The parent inode might have been deleted in the send snapshot + */ + ret = get_inode_info(sctx->send_root, cur->dir, NULL, + NULL, NULL, NULL, NULL, NULL); + if (ret == -ENOENT) { + ret = 0; continue; + } + if (ret < 0) + goto out; + ret = send_utimes(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; @@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, u64 left_gen; u64 right_gen; int ret = 0; + struct waiting_dir_move *wdm; if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) return 0; @@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - if (is_waiting_for_move(sctx, di_key.objectid)) { + wdm = get_waiting_dir_move(sctx, di_key.objectid); + if (wdm && !wdm->orphanized) { ret = add_pending_dir_move(sctx, sctx->cur_ino, sctx->cur_inode_gen, @@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = is_ancestor(sctx->parent_root, sctx->cur_ino, sctx->cur_inode_gen, ino, path_before); - break; + if (ret) + break; } fs_path_reset(path_before); @@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); goto out; if (ret) { struct name_cache_entry *nce; + struct waiting_dir_move *wdm; ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + + /* + * If ow_inode has its rename operation delayed + * make sure that its orphanized name is used in + * the source path when performing its rename + * operation. + */ + if (is_waiting_for_move(sctx, ow_inode)) { + wdm = get_waiting_dir_move(sctx, + ow_inode); + ASSERT(wdm); + wdm->orphanized = true; + } + /* * Make sure we clear our orphanized inode's * name from the name cache. This is because the @@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); name_cache_delete(sctx, nce); kfree(nce); } + + /* + * ow_inode might currently be an ancestor of + * cur_ino, therefore compute valid_path (the + * current path of cur_ino) again because it + * might contain the pre-orphanization name of + * ow_inode, which is no longer valid. + */ + fs_path_reset(valid_path); + ret = get_cur_path(sctx, sctx->cur_ino, + sctx->cur_inode_gen, valid_path); + if (ret < 0) + goto out; } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx, } btrfs_release_path(path); + /* + * We don't actually care about pending_move as we are simply + * re-creating this inode and will be rename'ing it into place once we + * rename the parent directory. + */ ret = process_recorded_refs(sctx, &pending_move); - /* Only applicable to an incremental send. */ - ASSERT(pending_move == 0); - out: btrfs_free_path(path); return ret; @@ -4192,9 +4336,9 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, return -ENOMEM; /* - * This hack is needed because empty acl's are stored as zero byte + * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte - * acl's will fail later. To fix this, we send a dummy acl list that + * acls will fail later. To fix this, we send a dummy acl list that * only contains the version number and no entries. */ if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) || @@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "reference"); + return -EIO; + } if (!sctx->cur_inode_new_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) { @@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "xattr"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result == BTRFS_COMPARE_TREE_NEW) @@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx, { int ret = 0; - BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid); + if (sctx->cur_ino != sctx->cmp_key->objectid) { + inconsistent_snapshot_error(sctx, result, "extent"); + return -EIO; + } if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) { if (result != BTRFS_COMPARE_TREE_DELETED) @@ -5939,6 +6092,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + unsigned alloc_size; int sort_clone_roots = 0; int index; @@ -5978,6 +6132,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) goto out; } + if (arg->clone_sources_count > + ULLONG_MAX / sizeof(*arg->clone_sources)) { + ret = -EINVAL; + goto out; + } + if (!access_ok(VERIFY_READ, arg->clone_sources, sizeof(*arg->clone_sources) * arg->clone_sources_count)) { @@ -6022,40 +6182,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = vmalloc(sctx->send_max_size); + sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } } - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; + sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); + if (!sctx->read_buf) { + ret = -ENOMEM; + goto out; + } } sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - sctx->clone_roots = vzalloc(sizeof(struct clone_root) * - (arg->clone_sources_count + 1)); + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + + sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!sctx->clone_roots) { - ret = -ENOMEM; - goto out; + sctx->clone_roots = vzalloc(alloc_size); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; + } } + alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + if (arg->clone_sources_count) { - clone_sources_tmp = vmalloc(arg->clone_sources_count * - sizeof(*arg->clone_sources)); + clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; + clone_sources_tmp = vmalloc(alloc_size); + if (!clone_sources_tmp) { + ret = -ENOMEM; + goto out; + } } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, - arg->clone_sources_count * - sizeof(*arg->clone_sources)); + alloc_size); if (ret) { ret = -EFAULT; goto out; @@ -6089,7 +6262,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots[i].root = clone_root; clone_sources_to_rollback = i + 1; } - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); clone_sources_tmp = NULL; } @@ -6207,15 +6380,15 @@ out: btrfs_root_dec_send_in_progress(sctx->parent_root); kfree(arg); - vfree(clone_sources_tmp); + kvfree(clone_sources_tmp); if (sctx) { if (sctx->send_filp) fput(sctx->send_filp); - vfree(sctx->clone_roots); - vfree(sctx->send_buf); - vfree(sctx->read_buf); + kvfree(sctx->clone_roots); + kvfree(sctx->send_buf); + kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index e05619f241be..875c757e73e2 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -36,7 +36,7 @@ static inline void put_unaligned_le8(u8 val, void *p) * * The end result is that anyone who #includes ctree.h gets a * declaration for the btrfs_set_foo functions and btrfs_foo functions, - * which are wappers of btrfs_set_token_#bits functions and + * which are wrappers of btrfs_set_token_#bits functions and * btrfs_get_token_#bits functions, which are defined in this file. * * These setget functions do all the extent_buffer related mapping diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 00b8f37cc306..4071fe2bd098 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno) return errstr; } -static void save_error_info(struct btrfs_fs_info *fs_info) -{ - /* - * today we only save the error info into ram. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); -} - /* btrfs handle error by forcing the filesystem readonly */ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) { @@ -121,7 +112,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * Note that a running device replace operation is not * canceled here although there is no way to update * the progress. It would add the risk of a deadlock, - * therefore the canceling is ommited. The only penalty + * therefore the canceling is omitted. The only penalty * is that some I/O remains active until the procedure * completes. The next time when the filesystem is * mounted writeable again, the device replace @@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } /* - * __btrfs_std_error decodes expected errors from the caller and + * __btrfs_handle_fs_error decodes expected errors from the caller and * invokes the approciate error response. */ __cold -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, } #endif + /* + * Today we only save the error info to memory. Long term we'll + * also send it down to the disk + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + /* Don't go through full error handling during mount */ - save_error_info(fs_info); if (sb->s_flags & MS_BORN) btrfs_handle_error(fs_info); } @@ -188,6 +184,22 @@ static const char * const logtypes[] = { "debug", }; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. + */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; @@ -196,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_list args; const char *type = logtypes[4]; int kern_level; + struct ratelimit_state *ratelimit; va_start(args, fmt); @@ -206,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) lvl[size] = '\0'; fmt += size; type = logtypes[kern_level - '0']; - } else + ratelimit = &printk_limits[kern_level - '0']; + } else { *lvl = '\0'; + /* Default to debug output */ + ratelimit = &printk_limits[7]; + } vaf.fmt = fmt; vaf.va = &args; - printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); + if (__ratelimit(ratelimit)) + printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf); va_end(args); } @@ -233,26 +251,28 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) */ __cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *function, + const char *function, unsigned int line, int errno) { + struct btrfs_fs_info *fs_info = trans->fs_info; + trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used && list_empty(&trans->new_bgs)) { + if (!trans->dirty && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s:%d: Aborting unused transaction(%s).", function, line, errstr); return; } ACCESS_ONCE(trans->transaction->aborted) = errno; /* Wake up anybody who may be waiting on this transaction */ - wake_up(&root->fs_info->transaction_wait); - wake_up(&root->fs_info->transaction_blocked_wait); - __btrfs_std_error(root->fs_info, function, line, errno, NULL); + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); } /* * __btrfs_panic decodes unexpected, fatal errors from the caller, @@ -436,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ break; case Opt_nodatasum: - btrfs_set_and_info(root, NODATASUM, + btrfs_set_and_info(info, NODATASUM, "setting nodatasum"); break; case Opt_datasum: - if (btrfs_test_opt(root, NODATASUM)) { - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATASUM)) { + if (btrfs_test_opt(info, NODATACOW)) btrfs_info(root->fs_info, "setting datasum, datacow enabled"); else btrfs_info(root->fs_info, "setting datasum"); @@ -450,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: - if (!btrfs_test_opt(root, NODATACOW)) { - if (!btrfs_test_opt(root, COMPRESS) || - !btrfs_test_opt(root, FORCE_COMPRESS)) { + if (!btrfs_test_opt(info, NODATACOW)) { + if (!btrfs_test_opt(info, COMPRESS) || + !btrfs_test_opt(info, FORCE_COMPRESS)) { btrfs_info(root->fs_info, "setting nodatacow, compression disabled"); } else { @@ -465,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_datacow: - btrfs_clear_and_info(root, NODATACOW, + btrfs_clear_and_info(info, NODATACOW, "setting datacow"); break; case Opt_compress_force: @@ -474,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, /* Fallthrough */ case Opt_compress: case Opt_compress_type: - saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + saved_compress_type = btrfs_test_opt(info, + COMPRESS) ? info->compress_type : BTRFS_COMPRESS_NONE; saved_compress_force = - btrfs_test_opt(root, FORCE_COMPRESS); + btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -517,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } - if ((btrfs_test_opt(root, COMPRESS) && + if ((btrfs_test_opt(info, COMPRESS) && (info->compress_type != saved_compress_type || compress_force != saved_compress_force)) || - (!btrfs_test_opt(root, COMPRESS) && + (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { btrfs_info(root->fs_info, "%s %s compression", @@ -530,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force = false; break; case Opt_ssd: - btrfs_set_and_info(root, SSD, + btrfs_set_and_info(info, SSD, "use ssd allocation scheme"); break; case Opt_ssd_spread: - btrfs_set_and_info(root, SSD_SPREAD, + btrfs_set_and_info(info, SSD_SPREAD, "use spread ssd allocation scheme"); btrfs_set_opt(info->mount_opt, SSD); break; case Opt_nossd: - btrfs_set_and_info(root, NOSSD, + btrfs_set_and_info(info, NOSSD, "not using ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, SSD); break; case Opt_barrier: - btrfs_clear_and_info(root, NOBARRIER, + btrfs_clear_and_info(info, NOBARRIER, "turning on barriers"); break; case Opt_nobarrier: - btrfs_set_and_info(root, NOBARRIER, + btrfs_set_and_info(info, NOBARRIER, "turning off barriers"); break; case Opt_thread_pool: @@ -608,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: - btrfs_set_and_info(root, NOTREELOG, + btrfs_set_and_info(info, NOTREELOG, "disabling tree log"); break; case Opt_treelog: - btrfs_clear_and_info(root, NOTREELOG, + btrfs_clear_and_info(info, NOTREELOG, "enabling tree log"); break; case Opt_norecovery: case Opt_nologreplay: - btrfs_set_and_info(root, NOLOGREPLAY, + btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; case Opt_flushoncommit: - btrfs_set_and_info(root, FLUSHONCOMMIT, + btrfs_set_and_info(info, FLUSHONCOMMIT, "turning on flush-on-commit"); break; case Opt_noflushoncommit: - btrfs_clear_and_info(root, FLUSHONCOMMIT, + btrfs_clear_and_info(info, FLUSHONCOMMIT, "turning off flush-on-commit"); break; case Opt_ratio: @@ -642,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, } break; case Opt_discard: - btrfs_set_and_info(root, DISCARD, + btrfs_set_and_info(info, DISCARD, "turning on discard"); break; case Opt_nodiscard: - btrfs_clear_and_info(root, DISCARD, + btrfs_clear_and_info(info, DISCARD, "turning off discard"); break; case Opt_space_cache: @@ -655,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, strcmp(args[0].from, "v1") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_and_info(root, SPACE_CACHE, + btrfs_set_and_info(info, SPACE_CACHE, "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { btrfs_clear_opt(root->fs_info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(root, FREE_SPACE_TREE, + btrfs_set_and_info(info, + FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -671,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - if (btrfs_test_opt(root, SPACE_CACHE)) { - btrfs_clear_and_info(root, SPACE_CACHE, + if (btrfs_test_opt(info, SPACE_CACHE)) { + btrfs_clear_and_info(info, + SPACE_CACHE, "disabling disk space caching"); } - if (btrfs_test_opt(root, FREE_SPACE_TREE)) { - btrfs_clear_and_info(root, FREE_SPACE_TREE, + if (btrfs_test_opt(info, FREE_SPACE_TREE)) { + btrfs_clear_and_info(info, + FREE_SPACE_TREE, "disabling free space tree"); } break; @@ -689,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling inode map caching"); break; case Opt_clear_cache: - btrfs_set_and_info(root, CLEAR_CACHE, + btrfs_set_and_info(info, CLEAR_CACHE, "force clearing of disk cache"); break; case Opt_user_subvol_rm_allowed: @@ -702,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); break; case Opt_defrag: - btrfs_set_and_info(root, AUTO_DEFRAG, + btrfs_set_and_info(info, AUTO_DEFRAG, "enabling auto defrag"); break; case Opt_nodefrag: - btrfs_clear_and_info(root, AUTO_DEFRAG, + btrfs_clear_and_info(info, AUTO_DEFRAG, "disabling auto defrag"); break; case Opt_recovery: @@ -814,22 +838,22 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { btrfs_err(root->fs_info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && - !btrfs_test_opt(root, FREE_SPACE_TREE) && - !btrfs_test_opt(root, CLEAR_CACHE)) { + !btrfs_test_opt(info, FREE_SPACE_TREE) && + !btrfs_test_opt(info, CLEAR_CACHE)) { btrfs_err(root->fs_info, "cannot disable free space tree"); ret = -EINVAL; } - if (!ret && btrfs_test_opt(root, SPACE_CACHE)) + if (!ret && btrfs_test_opt(info, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); - if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; @@ -1153,14 +1177,14 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - trace_btrfs_sync_fs(wait); + trace_btrfs_sync_fs(fs_info, wait); if (!wait) { filemap_flush(fs_info->btree_inode->i_mapping); return 0; } - btrfs_wait_ordered_roots(fs_info, -1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { @@ -1196,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) struct btrfs_root *root = info->tree_root; char *compress_type; - if (btrfs_test_opt(root, DEGRADED)) + if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); - if (btrfs_test_opt(root, NODATASUM)) + if (btrfs_test_opt(info, NODATASUM)) seq_puts(seq, ",nodatasum"); - if (btrfs_test_opt(root, NODATACOW)) + if (btrfs_test_opt(info, NODATACOW)) seq_puts(seq, ",nodatacow"); - if (btrfs_test_opt(root, NOBARRIER)) + if (btrfs_test_opt(info, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); @@ -1211,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); - if (btrfs_test_opt(root, COMPRESS)) { + if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; else compress_type = "lzo"; - if (btrfs_test_opt(root, FORCE_COMPRESS)) + if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else seq_printf(seq, ",compress=%s", compress_type); } - if (btrfs_test_opt(root, NOSSD)) + if (btrfs_test_opt(info, NOSSD)) seq_puts(seq, ",nossd"); - if (btrfs_test_opt(root, SSD_SPREAD)) + if (btrfs_test_opt(info, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); - else if (btrfs_test_opt(root, SSD)) + else if (btrfs_test_opt(info, SSD)) seq_puts(seq, ",ssd"); - if (btrfs_test_opt(root, NOTREELOG)) + if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); - if (btrfs_test_opt(root, NOLOGREPLAY)) + if (btrfs_test_opt(info, NOLOGREPLAY)) seq_puts(seq, ",nologreplay"); - if (btrfs_test_opt(root, FLUSHONCOMMIT)) + if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(root, DISCARD)) + if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); - else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + else if (btrfs_test_opt(info, FREE_SPACE_TREE)) seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); - if (btrfs_test_opt(root, RESCAN_UUID_TREE)) + if (btrfs_test_opt(info, RESCAN_UUID_TREE)) seq_puts(seq, ",rescan_uuid_tree"); - if (btrfs_test_opt(root, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); - if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); - if (btrfs_test_opt(root, ENOSPC_DEBUG)) + if (btrfs_test_opt(info, ENOSPC_DEBUG)) seq_puts(seq, ",enospc_debug"); - if (btrfs_test_opt(root, AUTO_DEFRAG)) + if (btrfs_test_opt(info, AUTO_DEFRAG)) seq_puts(seq, ",autodefrag"); - if (btrfs_test_opt(root, INODE_MAP_CACHE)) + if (btrfs_test_opt(info, INODE_MAP_CACHE)) seq_puts(seq, ",inode_cache"); - if (btrfs_test_opt(root, SKIP_BALANCE)) + if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) + if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA)) seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(root, CHECK_INTEGRITY)) + else if (btrfs_test_opt(info, CHECK_INTEGRITY)) seq_puts(seq, ",check_int"); if (info->check_integrity_print_mask) seq_printf(seq, ",check_int_print_mask=%d", @@ -1269,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%d", info->metadata_ratio); - if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR)) + if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_test_opt(root, FRAGMENT_DATA)) + if (btrfs_test_opt(info, FRAGMENT_DATA)) seq_puts(seq, ",fragment=data"); - if (btrfs_test_opt(root, FRAGMENT_METADATA)) + if (btrfs_test_opt(info, FRAGMENT_METADATA)) seq_puts(seq, ",fragment=metadata"); #endif seq_printf(seq, ",subvolid=%llu", @@ -1488,10 +1512,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); } else { /* - * Since SELinux(the only one supports security_mnt_opts) does - * NOT support changing context during remount/mount same sb, - * This must be the same or part of the same security options, - * just free it. + * Since SELinux (the only one supporting security_mnt_opts) + * does NOT support changing context during remount/mount of + * the same sb, this must be the same or part of the same + * security options, just free it. */ security_free_mnt_opts(sec_opts); } @@ -1669,8 +1693,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, unsigned long old_opts) { /* - * We need cleanup all defragable inodes if the autodefragment is - * close or the fs is R/O. + * We need to cleanup all defragable inodes if the autodefragment is + * close or the filesystem is read only. */ if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || @@ -1811,6 +1835,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } sb->s_flags &= ~MS_RDONLY; + + fs_info->open = 1; } out: wake_up_process(fs_info->transaction_kthread); @@ -1881,7 +1907,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int ret; /* - * We aren't under the device list lock, so this is racey-ish, but good + * We aren't under the device list lock, so this is racy-ish, but good * enough for our purposes. */ nr_devices = fs_info->fs_devices->open_devices; @@ -1900,7 +1926,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) if (!devices_info) return -ENOMEM; - /* calc min stripe number for data space alloction */ + /* calc min stripe number for data space allocation */ type = btrfs_get_alloc_profile(root, 1); if (type & BTRFS_BLOCK_GROUP_RAID0) { min_stripes = 2; @@ -1936,7 +1962,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space *= BTRFS_STRIPE_LEN; /* - * In order to avoid overwritting the superblock on the drive, + * In order to avoid overwriting the superblock on the drive, * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ @@ -2032,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * chunk). * * If metadata is exhausted, f_bavail will be 0. - * - * FIXME: not accurate for mixed block groups, total and free/used are ok, - * available appears slightly larger. */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -2051,9 +2074,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; u64 thresh = 0; + int mixed = 0; /* - * holding chunk_muext to avoid allocating new chunks, holding + * holding chunk_mutex to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ rcu_read_lock(); @@ -2076,8 +2100,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } - if (found->flags & BTRFS_BLOCK_GROUP_METADATA) - total_free_meta += found->disk_total - found->disk_used; + + /* + * Metadata in mixed block goup profiles are accounted in data + */ + if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (found->flags & BTRFS_BLOCK_GROUP_DATA) + mixed = 1; + else + total_free_meta += found->disk_total - + found->disk_used; + } total_used += found->disk_used; } @@ -2090,7 +2123,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) /* Account global block reserve as used, it's in logical size already */ spin_lock(&block_rsv->lock); - buf->f_bfree -= block_rsv->size >> bits; + /* Mixed block groups accounting is not byte-accurate, avoid overflow */ + if (buf->f_bfree >= block_rsv->size >> bits) + buf->f_bfree -= block_rsv->size >> bits; + else + buf->f_bfree = 0; spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); @@ -2115,7 +2152,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = 4 * 1024 * 1024; - if (total_free_meta - thresh < block_rsv->size) + if (!mixed && total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2204,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb) struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_sb(sb)->tree_root; + root->fs_info->fs_frozen = 1; + /* + * We don't need a barrier here, we'll wait for any transaction that + * could be in progress on other threads (and do delayed iputs that + * we want to avoid on a frozen filesystem), or do the commit + * ourselves. + */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ @@ -2214,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } +static int btrfs_unfreeze(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb)->tree_root; + + root->fs_info->fs_frozen = 0; + return 0; +} + static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2262,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, + .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { @@ -2293,7 +2346,7 @@ static void btrfs_interface_exit(void) static void btrfs_print_mod_info(void) { - printk(KERN_INFO "Btrfs loaded" + printk(KERN_INFO "Btrfs loaded, crc32c=%s" #ifdef CONFIG_BTRFS_DEBUG ", debug=on" #endif @@ -2303,36 +2356,8 @@ static void btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY ", integrity-checker=on" #endif - "\n"); -} - -static int btrfs_run_sanity_tests(void) -{ - int ret; - - ret = btrfs_init_test_fs(); - if (ret) - return ret; - - ret = btrfs_test_free_space_cache(); - if (ret) - goto out; - ret = btrfs_test_extent_buffer_operations(); - if (ret) - goto out; - ret = btrfs_test_extent_io(); - if (ret) - goto out; - ret = btrfs_test_inodes(); - if (ret) - goto out; - ret = btrfs_test_qgroups(); - if (ret) - goto out; - ret = btrfs_test_free_space_tree(); -out: - btrfs_destroy_test_fs(); - return ret; + "\n", + btrfs_crc32c_impl()); } static int __init init_btrfs_fs(void) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 539e7b5e3f86..c6569905d3d1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, if (!fs_info) return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + ret = kstrtoul(skip_spaces(buf), 0, &val); if (ret) return ret; @@ -323,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used); SPACE_INFO_ATTR(bytes_pinned); SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); +SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); @@ -334,6 +338,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(bytes_pinned), BTRFS_ATTR_PTR(bytes_reserved), BTRFS_ATTR_PTR(bytes_may_use), + BTRFS_ATTR_PTR(bytes_readonly), BTRFS_ATTR_PTR(disk_used), BTRFS_ATTR_PTR(disk_total), BTRFS_ATTR_PTR(total_bytes_pinned), @@ -364,7 +369,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); char *label = fs_info->super_copy->label; - return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ssize_t ret; + + spin_lock(&fs_info->super_lock); + ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + spin_unlock(&fs_info->super_lock); + + return ret; } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,6 +385,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); size_t p_len; + if (!fs_info) + return -EPERM; + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index f54bf450bad3..bf62ad919a95 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void) return new_inode(test_mnt->mnt_sb); } -int btrfs_init_test_fs(void) +static int btrfs_init_test_fs(void) { int ret; @@ -68,12 +68,12 @@ int btrfs_init_test_fs(void) if (IS_ERR(test_mnt)) { printk(KERN_ERR "btrfs: cannot mount test file system\n"); unregister_filesystem(&test_type); - return ret; + return PTR_ERR(test_mnt); } return 0; } -void btrfs_destroy_test_fs(void) +static void btrfs_destroy_test_fs(void) { kern_unmount(test_mnt); unregister_filesystem(&test_type); @@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) extent_io_tree_init(&fs_info->freed_extents[0], NULL); extent_io_tree_init(&fs_info->freed_extents[1], NULL); fs_info->pinned_extents = &fs_info->freed_extents[0]; + set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); + + test_mnt->mnt_sb->s_fs_info = fs_info; + return fs_info; } -static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + if (!fs_info) + return; + + if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, + &fs_info->fs_state))) + return; + + test_mnt->mnt_sb->s_fs_info = NULL; + spin_lock(&fs_info->buffer_lock); radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { struct extent_buffer *eb; @@ -167,15 +180,16 @@ void btrfs_free_dummy_root(struct btrfs_root *root) { if (!root) return; + /* Will be freed by btrfs_free_fs_roots */ + if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state))) + return; if (root->node) free_extent_buffer(root->node); - if (root->fs_info) - btrfs_free_dummy_fs_info(root->fs_info); kfree(root); } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length) +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) { struct btrfs_block_group_cache *cache; @@ -192,8 +206,8 @@ btrfs_alloc_dummy_block_group(unsigned long length) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; + cache->sectorsize = sectorsize; + cache->full_stripe_len = sectorsize; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } + +int btrfs_run_sanity_tests(void) +{ + int ret, i; + u32 sectorsize, nodesize; + u32 test_sectorsize[] = { + PAGE_SIZE, + }; + ret = btrfs_init_test_fs(); + if (ret) + return ret; + for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) { + sectorsize = test_sectorsize[i]; + for (nodesize = sectorsize; + nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE; + nodesize <<= 1) { + pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n", + sectorsize, nodesize); + ret = btrfs_test_free_space_cache(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_buffer_operations(sectorsize, + nodesize); + if (ret) + goto out; + ret = btrfs_test_extent_io(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_inodes(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_qgroups(sectorsize, nodesize); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(sectorsize, nodesize); + if (ret) + goto out; + } + } +out: + btrfs_destroy_test_fs(); + return ret; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 054b8c73c951..b17ffbe8f9f3 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -20,56 +20,29 @@ #define __BTRFS_TESTS #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_run_sanity_tests(void); #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; struct btrfs_trans_handle; -int btrfs_test_free_space_cache(void); -int btrfs_test_extent_buffer_operations(void); -int btrfs_test_extent_io(void); -int btrfs_test_inodes(void); -int btrfs_test_qgroups(void); -int btrfs_test_free_space_tree(void); -int btrfs_init_test_fs(void); -void btrfs_destroy_test_fs(void); +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize); +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize); +int btrfs_test_inodes(u32 sectorsize, u32 nodesize); +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length); +btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else -static inline int btrfs_test_free_space_cache(void) -{ - return 0; -} -static inline int btrfs_test_extent_buffer_operations(void) -{ - return 0; -} -static inline int btrfs_init_test_fs(void) -{ - return 0; -} -static inline void btrfs_destroy_test_fs(void) -{ -} -static inline int btrfs_test_extent_io(void) -{ - return 0; -} -static inline int btrfs_test_inodes(void) -{ - return 0; -} -static inline int btrfs_test_qgroups(void) -{ - return 0; -} -static inline int btrfs_test_free_space_tree(void) +static inline int btrfs_run_sanity_tests(void) { return 0; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index f51963a8f929..199569174637 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -22,10 +22,11 @@ #include "../extent_io.h" #include "../disk-io.h" -static int test_btrfs_split_item(void) +static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { - struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; + struct btrfs_path *path = NULL; + struct btrfs_root *root = NULL; struct extent_buffer *eb; struct btrfs_item *item; char *value = "mary had a little lamb"; @@ -40,20 +41,28 @@ static int test_btrfs_split_item(void) test_msg("Running btrfs_split_item tests\n"); - root = btrfs_alloc_dummy_root(); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Could not allocate fs_info\n"); + return -ENOMEM; + } + + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); - return PTR_ERR(root); + ret = PTR_ERR(root); + goto out; } path = btrfs_alloc_path(); if (!path) { test_msg("Could not allocate path\n"); - kfree(root); - return -ENOMEM; + ret = -ENOMEM; + goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, + nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; @@ -218,12 +227,13 @@ static int test_btrfs_split_item(void) } out: btrfs_free_path(path); - kfree(root); + btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -int btrfs_test_extent_buffer_operations(void) +int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize) { - test_msg("Running extent buffer operation tests"); - return test_btrfs_split_item(); + test_msg("Running extent buffer operation tests\n"); + return test_btrfs_split_item(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 70948b13bc81..d19ab0317283 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -21,6 +21,7 @@ #include <linux/slab.h> #include <linux/sizes.h> #include "btrfs-tests.h" +#include "../ctree.h" #include "../extent_io.h" #define PROCESS_UNLOCK (1 << 0) @@ -65,7 +66,7 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end, return count; } -static int test_find_delalloc(void) +static int test_find_delalloc(u32 sectorsize) { struct inode *inode; struct extent_io_tree tmp; @@ -113,7 +114,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -122,9 +123,9 @@ static int test_find_delalloc(void) test_msg("Should have found at least one delalloc\n"); goto out_bits; } - if (start != 0 || end != 4095) { - test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n", - start, end); + if (start != 0 || end != (sectorsize - 1)) { + test_msg("Expected start 0 end %u, got start %llu end %llu\n", + sectorsize - 1, start, end); goto out_bits; } unlock_extent(&tmp, start, end); @@ -144,7 +145,7 @@ static int test_find_delalloc(void) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -172,11 +173,11 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = max_bytes + 4096; + test_start = max_bytes + sectorsize; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_SHIFT); if (!locked_page) { - test_msg("Could'nt find the locked page\n"); + test_msg("Couldn't find the locked page\n"); goto out_bits; } start = test_start; @@ -199,7 +200,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -262,7 +263,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -272,6 +273,16 @@ out: return ret; } +/** + * test_bit_in_byte - Determine whether a bit is set in a byte + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit_in_byte(int nr, const u8 *addr) +{ + return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); +} + static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { @@ -298,25 +309,29 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return -EINVAL; } - bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Setting straddling pages failed\n"); - return -EINVAL; - } + /* Straddling pages test */ + if (len > PAGE_SIZE) { + bitmap_set(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } } /* @@ -333,7 +348,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, for (i = 0; i < len * BITS_PER_BYTE; i++) { int bit, bit1; - bit = !!test_bit(i, bitmap); + bit = !!test_bit_in_byte(i, (u8 *)bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { test_msg("Testing bit pattern failed\n"); @@ -351,15 +366,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, return 0; } -static int test_eb_bitmaps(void) +static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { - unsigned long len = PAGE_SIZE * 4; + unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; int ret; test_msg("Running extent buffer bitmap tests\n"); + /* + * In ppc64, sectorsize can be 64K, thus 4 * 64K will be larger than + * BTRFS_MAX_METADATA_BLOCKSIZE. + */ + len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) + ? sectorsize * 4 : sectorsize; + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); @@ -379,7 +401,7 @@ static int test_eb_bitmaps(void) /* Do it over again with an extent buffer which isn't page-aligned. */ free_extent_buffer(eb); - eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len); + eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); @@ -393,17 +415,17 @@ out: return ret; } -int btrfs_test_extent_io(void) +int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; test_msg("Running extent I/O tests\n"); - ret = test_find_delalloc(); + ret = test_find_delalloc(sectorsize); if (ret) goto out; - ret = test_eb_bitmaps(); + ret = test_eb_bitmaps(sectorsize, nodesize); out: test_msg("Extent I/O tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 514247515312..3221c8dee272 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -22,10 +22,10 @@ #include "../disk-io.h" #include "../free-space-cache.h" -#define BITS_PER_BITMAP (PAGE_SIZE * 8) +#define BITS_PER_BITMAP (PAGE_SIZE * 8UL) /* - * This test just does basic sanity checking, making sure we can add an exten + * This test just does basic sanity checking, making sure we can add an extent * entry and remove space from either end and the middle, and make sure we can * remove space that covers adjacent extent entries. */ @@ -99,7 +99,8 @@ static int test_extents(struct btrfs_block_group_cache *cache) return 0; } -static int test_bitmaps(struct btrfs_block_group_cache *cache) +static int test_bitmaps(struct btrfs_block_group_cache *cache, + u32 sectorsize) { u64 next_bitmap_offset; int ret; @@ -139,7 +140,7 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) * The first bitmap we have starts at offset 0 so the next one is just * at the end of the first bitmap. */ - next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + next_bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); /* Test a bit straddling two bitmaps */ ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, @@ -167,9 +168,10 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) } /* This is the high grade jackassery */ -static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) +static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache, + u32 sectorsize) { - u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); + u64 bitmap_offset = (u64)(BITS_PER_BITMAP * sectorsize); int ret; test_msg("Running bitmap and extent tests\n"); @@ -396,11 +398,13 @@ static int check_cache_empty(struct btrfs_block_group_cache *cache) * wasn't optimal as they could be spread all over the block group while under * concurrency (extra overhead and fragmentation). * - * This stealing approach is benefical, since we always prefer to allocate from - * extent entries, both for clustered and non-clustered allocation requests. + * This stealing approach is beneficial, since we always prefer to allocate + * from extent entries, both for clustered and non-clustered allocation + * requests. */ static int -test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache, + u32 sectorsize) { int ret; u64 offset; @@ -538,7 +542,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -596,8 +600,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { - test_msg("Cache free space is not 1Mb + 4Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", sectorsize); return -EINVAL; } @@ -610,22 +614,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is a sectorsize free space region in a bitmap. + * Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 4096) { - test_msg("Cache free space is not 4Kb\n"); + if (cache->free_space_ctl->free_space != sectorsize) { + test_msg("Cache free space is not %u\n", sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 4096, 0, + 0, sectorsize, 0, &max_extent_size); if (offset != (SZ_128M + SZ_16M)) { - test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", - offset); + test_msg("Failed to allocate %u, returned offset : %llu\n", + sectorsize, offset); return -EINVAL; } @@ -732,7 +739,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, SZ_32M, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 2 * sectorsize); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -756,7 +763,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Confirm that our extent entry didn't stole all free space from the - * bitmap, because of the small 8Kb free space region. + * bitmap, because of the small 2 * sectorsize free space region. */ ret = check_num_extents_and_bitmaps(cache, 2, 1); if (ret) @@ -782,8 +789,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -ENOENT; } - if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { - test_msg("Cache free space is not 1Mb + 8Kb\n"); + if (cache->free_space_ctl->free_space != (SZ_1M + 2 * sectorsize)) { + test_msg("Cache free space is not 1Mb + %u\n", 2 * sectorsize); return -EINVAL; } @@ -795,21 +802,25 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return -EINVAL; } - /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + /* + * All that remains is 2 * sectorsize free space region + * in a bitmap. Confirm. + */ ret = check_num_extents_and_bitmaps(cache, 1, 1); if (ret) return ret; - if (cache->free_space_ctl->free_space != 8192) { - test_msg("Cache free space is not 8Kb\n"); + if (cache->free_space_ctl->free_space != 2 * sectorsize) { + test_msg("Cache free space is not %u\n", 2 * sectorsize); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 8192, 0, + 0, 2 * sectorsize, 0, &max_extent_size); if (offset != SZ_32M) { - test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + test_msg("Failed to allocate %u, offset: %llu\n", + 2 * sectorsize, offset); return -EINVAL; } @@ -824,29 +835,38 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) return 0; } -int btrfs_test_free_space_cache(void) +int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_block_group_cache *cache; struct btrfs_root *root = NULL; int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); - cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); + /* + * For ppc64 (with 64k page size), bytes per bitmap might be + * larger than 1G. To make bitmap test available in ppc64, + * alloc dummy block group whose size cross bitmaps. + */ + cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize + + PAGE_SIZE, sectorsize); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; } - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + ret = PTR_ERR(root); goto out; + } root->fs_info->extent_root = root; cache->fs_info = root->fs_info; @@ -854,17 +874,18 @@ int btrfs_test_free_space_cache(void) ret = test_extents(cache); if (ret) goto out; - ret = test_bitmaps(cache); + ret = test_bitmaps(cache, sectorsize); if (ret) goto out; - ret = test_bitmaps_and_extents(cache); + ret = test_bitmaps_and_extents(cache, sectorsize); if (ret) goto out; - ret = test_steal_space_from_bitmap_to_extent(cache); + ret = test_steal_space_from_bitmap_to_extent(cache, sectorsize); out: btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7cea4462acd5..7508d3b42780 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../disk-io.h" @@ -30,7 +31,7 @@ struct free_space_extent { * The test cases align their operations to this in order to hit some of the * edge cases in the bitmap code. */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -439,25 +440,27 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_block_group_cache *, struct btrfs_path *); -static int run_test(test_func_t test_func, int bitmaps) +static int run_test(test_func_t test_func, int bitmaps, + u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; struct btrfs_block_group_cache *cache = NULL; struct btrfs_trans_handle trans; struct btrfs_path *path = NULL; int ret; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate dummy root\n"); - ret = PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); goto out; } @@ -466,7 +469,8 @@ static int run_test(test_func_t test_func, int bitmaps) root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, + nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -474,9 +478,9 @@ static int run_test(test_func_t test_func, int bitmaps) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -531,20 +535,22 @@ out: btrfs_free_path(path); btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int run_test_both_formats(test_func_t test_func) +static int run_test_both_formats(test_func_t test_func, + u32 sectorsize, u32 nodesize) { int ret; - ret = run_test(test_func, 0); + ret = run_test(test_func, 0, sectorsize, nodesize); if (ret) return ret; - return run_test(test_func, 1); + return run_test(test_func, 1, sectorsize, nodesize); } -int btrfs_test_free_space_tree(void) +int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) { test_func_t tests[] = { test_empty_block_group, @@ -561,9 +567,11 @@ int btrfs_test_free_space_tree(void) test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i]); + int ret = run_test_both_formats(tests[i], sectorsize, + nodesize); if (ret) { - test_msg("%pf failed\n", tests[i]); + test_msg("%pf : sectorsize %u failed\n", + tests[i], sectorsize); return ret; } } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 863a6a3af1f8..9f72aeda9220 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../btrfs_inode.h" @@ -86,19 +87,19 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288] - * [hole ][inline][ hole ][ regular ][regular1 split][ hole ] + * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] * - * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056] - * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ] + * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] + * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] * - * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ] - * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent] + * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] * - * [81920-86016] - * [ regular ] + * [69635-73731][ 73731 - 86019 ][86019-90115] + * [ regular ][ hole but no extent][ regular ] */ -static void setup_file_extents(struct btrfs_root *root) +static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) { int slot = 0; u64 disk_bytenr = SZ_1M; @@ -119,7 +120,7 @@ static void setup_file_extents(struct btrfs_root *root) insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; - offset = 4096; + offset = sectorsize; /* Now another hole */ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, @@ -128,100 +129,108 @@ static void setup_file_extents(struct btrfs_root *root) offset += 4; /* Now for a regular extent */ - insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - disk_bytenr += 4096; - offset += 4095; + disk_bytenr += sectorsize; + offset += sectorsize - 1; /* * Now for 3 extents that were split from a hole punch so we test * offsets properly. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG, - 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, + BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now for a unwritten prealloc extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; + offset += sectorsize; /* * We want to jack up disk_bytenr a little more so the em stuff doesn't * merge our records. */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* * Now for a partially written prealloc extent, basically the same as * the hole punch example above. Ram_bytes never changes when you mark * extents written btw. */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_PREALLOC, 0, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += sectorsize; + insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, + disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, + slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); slot++; - offset += 8192; - disk_bytenr += 16384; + offset += 2 * sectorsize; + disk_bytenr += 4 * sectorsize; /* Now a normal compressed extent */ - insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; + offset += 2 * sectorsize; /* No merges */ - disk_bytenr += 8192; + disk_bytenr += 2 * sectorsize; /* Now a split compressed extent */ - insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); + insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, + BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096, + offset += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, + disk_bytenr + sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 4096; - insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096, + offset += sectorsize; + insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, + 2 * sectorsize, disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); slot++; - offset += 8192; - disk_bytenr += 8192; + offset += 2 * sectorsize; + disk_bytenr += 2 * sectorsize; /* Now extents that have a hole but no hole extent */ - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); slot++; - offset += 16384; - disk_bytenr += 4096; - insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096, - BTRFS_FILE_EXTENT_REG, 0, slot); + offset += 4 * sectorsize; + disk_bytenr += sectorsize; + insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } static unsigned long prealloc_only = 0; static unsigned long compressed_only = 0; static unsigned long vacancy_only = 0; -static noinline int test_btrfs_get_extent(void) +static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -240,23 +249,19 @@ static noinline int test_btrfs_get_extent(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - /* - * We do this since btrfs_get_extent wants to assign em->bdev to - * root->fs_info->fs_devices->latest_bdev. - */ - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -264,7 +269,7 @@ static noinline int test_btrfs_get_extent(void) /* * We will just free a dummy node if it's ref count is 2 so we need an - * extra ref so our searches don't accidently release our page. + * extra ref so our searches don't accidentally release our page. */ extent_buffer_get(root->node); btrfs_set_header_nritems(root->node, 0); @@ -273,7 +278,7 @@ static noinline int test_btrfs_get_extent(void) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, 0, sectorsize, 0); if (IS_ERR(em)) { em = NULL; test_msg("Got an error when we shouldn't have\n"); @@ -295,7 +300,7 @@ static noinline int test_btrfs_get_extent(void) * setup_file_extents, so if you change anything there you need to * update the comment and update the expected values below. */ - setup_file_extents(root); + setup_file_extents(root, sectorsize); em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0); if (IS_ERR(em)) { @@ -318,7 +323,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -327,7 +332,8 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected an inline, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4091) { + + if (em->start != offset || em->len != (sectorsize - 5)) { test_msg("Unexpected extent wanted start %llu len 1, got start " "%llu len %llu\n", offset, em->start, em->len); goto out; @@ -344,7 +350,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -366,7 +372,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -375,7 +381,7 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4095) { + if (em->start != offset || em->len != sectorsize - 1) { test_msg("Unexpected extent wanted start %llu len 4095, got " "start %llu len %llu\n", offset, em->start, em->len); goto out; @@ -393,7 +399,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -402,9 +408,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -421,7 +428,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -430,9 +437,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -442,7 +450,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -451,9 +459,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -475,7 +484,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -484,9 +493,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -503,7 +513,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -512,9 +522,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -532,7 +543,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -541,9 +552,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -564,7 +576,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -573,9 +585,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != prealloc_only) { @@ -598,7 +611,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -607,9 +620,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -631,7 +645,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -640,9 +654,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -665,7 +680,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -674,9 +689,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -691,7 +707,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -701,9 +717,10 @@ static noinline int test_btrfs_get_extent(void) disk_bytenr, em->block_start); goto out; } - if (em->start != offset || em->len != 8192) { - test_msg("Unexpected extent wanted start %llu len 8192, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 2 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 2 * sectorsize, em->start, em->len); goto out; } if (em->flags != compressed_only) { @@ -725,7 +742,7 @@ static noinline int test_btrfs_get_extent(void) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset + 6, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -734,9 +751,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -765,9 +783,10 @@ static noinline int test_btrfs_get_extent(void) * length of the actual hole, if this changes we'll have to change this * test. */ - if (em->start != offset || em->len != 12288) { - test_msg("Unexpected extent wanted start %llu len 12288, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != 3 * sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u, " + "got start %llu len %llu\n", + offset, 3 * sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -783,7 +802,7 @@ static noinline int test_btrfs_get_extent(void) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -792,9 +811,10 @@ static noinline int test_btrfs_get_extent(void) test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != offset || em->len != 4096) { - test_msg("Unexpected extent wanted start %llu len 4096, got " - "start %llu len %llu\n", offset, em->start, em->len); + if (em->start != offset || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %llu len %u," + "got start %llu len %llu\n", + offset, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -812,11 +832,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int test_hole_first(void) +static int test_hole_first(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; struct extent_map *em = NULL; @@ -832,19 +854,19 @@ static int test_hole_first(void) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, 4096); + root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -861,9 +883,9 @@ static int test_hole_first(void) * btrfs_get_extent. */ insert_inode_item_key(root); - insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096, - BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0); + insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, + sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); + em = btrfs_get_extent(inode, NULL, 0, 0, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; @@ -872,9 +894,10 @@ static int test_hole_first(void) test_msg("Expected a hole, got %llu\n", em->block_start); goto out; } - if (em->start != 0 || em->len != 4096) { - test_msg("Unexpected extent wanted start 0 len 4096, got start " - "%llu len %llu\n", em->start, em->len); + if (em->start != 0 || em->len != sectorsize) { + test_msg("Unexpected extent wanted start 0 len %u, " + "got start %llu len %llu\n", + sectorsize, em->start, em->len); goto out; } if (em->flags != vacancy_only) { @@ -884,18 +907,19 @@ static int test_hole_first(void) } free_extent_map(em); - em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0); + em = btrfs_get_extent(inode, NULL, 0, sectorsize, 2 * sectorsize, 0); if (IS_ERR(em)) { test_msg("Got an error when we shouldn't have\n"); goto out; } - if (em->block_start != 4096) { + if (em->block_start != sectorsize) { test_msg("Expected a real extent, got %llu\n", em->block_start); goto out; } - if (em->start != 4096 || em->len != 4096) { - test_msg("Unexpected extent wanted start 4096 len 4096, got " - "start %llu len %llu\n", em->start, em->len); + if (em->start != sectorsize || em->len != sectorsize) { + test_msg("Unexpected extent wanted start %u len %u, " + "got start %llu len %llu\n", + sectorsize, sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { @@ -909,11 +933,13 @@ out: free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -static int test_extent_accounting(void) +static int test_extent_accounting(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct inode *inode = NULL; struct btrfs_root *root = NULL; int ret = -ENOMEM; @@ -924,15 +950,15 @@ static int test_extent_accounting(void) return ret; } - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); goto out; } @@ -954,10 +980,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, - BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -969,10 +996,10 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_KERNEL); @@ -987,10 +1014,11 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ BTRFS_I(inode)->outstanding_extents++; ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + (BTRFS_MAX_EXTENT_SIZE >> 1) + + sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -1004,16 +1032,17 @@ static int test_extent_accounting(void) } /* - * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize HOLE][BTRFS_MAX_EXTENT_SIZE+sectorsize] * * I'm artificially adding 2 to outstanding_extents because in the * buffered IO case we'd add things up as we go, but I don't feel like * doing that here, this isn't the interesting case we want to test. */ BTRFS_I(inode)->outstanding_extents += 2; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, - (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, - NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, + (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, + NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1025,10 +1054,13 @@ static int test_extent_accounting(void) goto out; } - /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + /* + * [BTRFS_MAX_EXTENT_SIZE+sectorsize][sectorsize][BTRFS_MAX_EXTENT_SIZE+sectorsize] + */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1042,8 +1074,8 @@ static int test_extent_accounting(void) /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, NULL, GFP_KERNEL); @@ -1063,8 +1095,9 @@ static int test_extent_accounting(void) * might fail and I'd rather satisfy my paranoia at this point. */ BTRFS_I(inode)->outstanding_extents++; - ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, - BTRFS_MAX_EXTENT_SIZE+8191, NULL); + ret = btrfs_set_extent_delalloc(inode, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1100,10 +1133,11 @@ out: NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } -int btrfs_test_inodes(void) +int btrfs_test_inodes(u32 sectorsize, u32 nodesize) { int ret; @@ -1112,13 +1146,13 @@ int btrfs_test_inodes(void) set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); test_msg("Running btrfs_get_extent tests\n"); - ret = test_btrfs_get_extent(); + ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - ret = test_hole_first(); + ret = test_hole_first(sectorsize, nodesize); if (ret) return ret; test_msg("Running outstanding_extents tests\n"); - return test_extent_accounting(); + return test_extent_accounting(sectorsize, nodesize); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 8ea5d34bc5a2..4407fef7c16c 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -16,6 +16,7 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" #include "../transaction.h" @@ -216,7 +217,8 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, return ret; } -static int test_no_shared_qgroup(struct btrfs_root *root) +static int test_no_shared_qgroup(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -227,29 +229,30 @@ static int test_no_shared_qgroup(struct btrfs_root *root) btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5); + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FS_TREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } /* - * Since the test trans doesn't havee the complicated delayed refs, + * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. */ - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -257,32 +260,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_item(root, 4096, 4096); + ret = remove_extent_item(root, nodesize, nodesize); if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -290,14 +294,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -310,7 +314,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) * right, also remove one of the roots and make sure the exclusive count is * adjusted properly. */ -static int test_multiple_refs(struct btrfs_root *root) +static int test_multiple_refs(struct btrfs_root *root, + u32 sectorsize, u32 nodesize) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; @@ -322,25 +327,29 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); - /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256); + /* + * We have BTRFS_FS_TREE_OBJECTID created already from the + * previous test. + */ + ret = btrfs_create_qgroup(NULL, fs_info, BTRFS_FIRST_FREE_OBJECTID); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); + ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FS_TREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -348,30 +357,32 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = add_tree_ref(root, 4096, 4096, 0, 256); + ret = add_tree_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -379,35 +390,38 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + nodesize, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots); if (ret) { ulist_free(old_roots); test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = remove_extent_ref(root, 4096, 4096, 0, 256); + ret = remove_extent_ref(root, nodesize, nodesize, 0, + BTRFS_FIRST_FREE_OBJECTID); if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -415,19 +429,21 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, - old_roots, new_roots); + ret = btrfs_qgroup_account_extent(&trans, fs_info, nodesize, + nodesize, old_roots, new_roots); if (ret) { test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } - if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FIRST_FREE_OBJECTID, + 0, 0)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } - if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) { + if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID, + nodesize, nodesize)) { test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } @@ -435,24 +451,26 @@ static int test_multiple_refs(struct btrfs_root *root) return 0; } -int btrfs_test_qgroups(void) +int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *root; struct btrfs_root *tmp_root; int ret = 0; - root = btrfs_alloc_dummy_root(); - if (IS_ERR(root)) { - test_msg("Couldn't allocate root\n"); - return PTR_ERR(root); + fs_info = btrfs_alloc_dummy_fs_info(); + if (!fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + return -ENOMEM; } - root->fs_info = btrfs_alloc_dummy_fs_info(); - if (!root->fs_info) { - test_msg("Couldn't allocate dummy fs info\n"); - ret = -ENOMEM; + root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + ret = PTR_ERR(root); goto out; } + /* We are using this root as our extent root */ root->fs_info->extent_root = root; @@ -468,7 +486,8 @@ int btrfs_test_qgroups(void) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize, + nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -476,16 +495,16 @@ int btrfs_test_qgroups(void) } btrfs_set_header_level(root->node, 0); btrfs_set_header_nritems(root->node, 0); - root->alloc_bytenr += 8192; + root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 5; + tmp_root->root_key.objectid = BTRFS_FS_TREE_OBJECTID; root->fs_info->fs_root = tmp_root; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { @@ -493,14 +512,14 @@ int btrfs_test_qgroups(void) goto out; } - tmp_root = btrfs_alloc_dummy_root(); + tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); goto out; } - tmp_root->root_key.objectid = 256; + tmp_root->root_key.objectid = BTRFS_FIRST_FREE_OBJECTID; ret = btrfs_insert_fs_root(root->fs_info, tmp_root); if (ret) { test_msg("Couldn't insert fs root %d\n", ret); @@ -508,11 +527,12 @@ int btrfs_test_qgroups(void) } test_msg("Running qgroup tests\n"); - ret = test_no_shared_qgroup(root); + ret = test_no_shared_qgroup(root, sectorsize, nodesize); if (ret) goto out; - ret = test_multiple_refs(root); + ret = test_multiple_refs(root, sectorsize, nodesize); out: btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43885e51b882..95d41919d034 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -311,10 +311,11 @@ loop: * when the transaction commits */ static int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + int force) { - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - root->last_trans < trans->transid) { + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && + root->last_trans < trans->transid) || force) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); @@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, smp_wmb(); spin_lock(&root->fs_info->fs_roots_radix_lock); - if (root->last_trans == trans->transid) { + if (root->last_trans == trans->transid && !force) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } @@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&root->fs_info->reloc_mutex); - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); mutex_unlock(&root->fs_info->reloc_mutex); return 0; @@ -560,6 +561,7 @@ again: h->transaction = cur_trans; h->root = root; h->use_count = 1; + h->fs_info = root->fs_info; h->type = type; h->can_flush_pending_bgs = true; @@ -817,6 +819,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; + u64 transid = trans->transid; unsigned long cur = trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); int err = 0; @@ -904,7 +907,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(root, cur, + btrfs_async_run_delayed_refs(root, cur, transid, must_run_delayed_refs == 1); } return err; @@ -943,7 +946,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error @@ -1311,6 +1314,92 @@ int btrfs_defrag_root(struct btrfs_root *root) } /* + * Do all special snapshot related qgroup dirty hack. + * + * Will do all needed qgroup inherit and dirty hack like switch commit + * roots inside one transaction and write all btree into disk, to make + * qgroup works. + */ +static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_root *src, + struct btrfs_root *parent, + struct btrfs_qgroup_inherit *inherit, + u64 dst_objectid) +{ + struct btrfs_fs_info *fs_info = src->fs_info; + int ret; + + /* + * Save some performance in the case that qgroups are not + * enabled. If this check races with the ioctl, rescan will + * kick in anyway. + */ + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_enabled) { + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return 0; + } + mutex_unlock(&fs_info->qgroup_ioctl_lock); + + /* + * We are going to commit transaction, see btrfs_commit_transaction() + * comment for reason locking tree_log_mutex + */ + mutex_lock(&fs_info->tree_log_mutex); + + ret = commit_fs_roots(trans, src); + if (ret) + goto out; + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret < 0) + goto out; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret < 0) + goto out; + + /* Now qgroup are all updated, we can inherit it to new qgroups */ + ret = btrfs_qgroup_inherit(trans, fs_info, + src->root_key.objectid, dst_objectid, + inherit); + if (ret < 0) + goto out; + + /* + * Now we do a simplified commit transaction, which will: + * 1) commit all subvolume and extent tree + * To ensure all subvolume and extent tree have a valid + * commit_root to accounting later insert_dir_item() + * 2) write all btree blocks onto disk + * This is to make sure later btree modification will be cowed + * Or commit_root can be populated and cause wrong qgroup numbers + * In this simplified commit, we don't really care about other trees + * like chunk and root tree, as they won't affect qgroup. + * And we don't write super to avoid half committed status. + */ + ret = commit_cowonly_roots(trans, src); + if (ret) + goto out; + switch_commit_roots(trans->transaction, fs_info); + ret = btrfs_write_and_wait_transaction(trans, src); + if (ret) + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction for qgroup"); + +out: + mutex_unlock(&fs_info->tree_log_mutex); + + /* + * Force parent root to be updated, as we recorded it before so its + * last_trans == cur_transid. + * Or it won't be committed again onto disk after later + * insert_dir_item() + */ + if (!ret) + record_root_in_trans(trans, parent, 1); + return ret; +} + +/* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. * @@ -1383,7 +1472,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root, 0); cur_time = current_fs_time(parent_inode->i_sb); @@ -1403,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); @@ -1416,11 +1505,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_run_delayed_items(trans, root); if (ret) { /* Transaction aborted */ - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } - record_root_in_trans(trans, root); + record_root_in_trans(trans, root, 0); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1455,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1466,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ @@ -1480,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1492,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1500,22 +1589,33 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } + /* + * Do special qgroup accounting for snapshot, as we do some qgroup + * snapshot hack to do fast snapshot. + * To co-operate with that hack, we do hack again. + * Or snapshot will be greatly slowed down by a subtree qgroup rescan + */ + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + if (ret < 0) + goto fail; + ret = btrfs_insert_dir_item(trans, parent_root, dentry->d_name.name, dentry->d_name.len, parent_inode, &key, @@ -1523,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1533,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, current_fs_time(parent_inode->i_sb); ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { @@ -1548,31 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } } ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - /* - * account qgroup counters before qgroup_inherit() - */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_account_extents(trans, fs_info); - if (ret) - goto fail; - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto fail; } @@ -1627,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root, SPACE_CACHE)) + if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; if (root->fs_info->update_uuid_tree_gen) super->uuid_tree_generation = root_item->generation; @@ -1768,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, WARN_ON(trans->use_count > 1); - btrfs_abort_transaction(trans, root, err); + btrfs_abort_transaction(trans, err); spin_lock(&root->fs_info->trans_lock); @@ -1813,15 +1896,15 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) return btrfs_start_delalloc_roots(fs_info, 1, -1); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { - if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) - btrfs_wait_ordered_roots(fs_info, -1); + if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); } static inline void @@ -2145,7 +2228,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; @@ -2195,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); + /* + * If fs has been frozen, we can not handle delayed iputs, otherwise + * it'll result in deadlock about SB_FREEZE_FS. + */ if (current != root->fs_info->transaction_kthread && - current != root->fs_info->cleaner_kthread) + current != root->fs_info->cleaner_kthread && + !root->fs_info->fs_frozen) btrfs_run_delayed_iputs(root); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 72be51f7ca2f..efb122643380 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -110,7 +110,6 @@ struct btrfs_trans_handle { u64 chunk_bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; - unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; @@ -121,6 +120,7 @@ struct btrfs_trans_handle { bool can_flush_pending_bgs; bool reloc_reserved; bool sync; + bool dirty; unsigned int type; /* * this root is only needed to validate that the root passed to @@ -128,6 +128,7 @@ struct btrfs_trans_handle { * Subvolume quota depends on this */ struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; struct list_head qgroup_ref_list; struct list_head new_bgs; @@ -144,7 +145,7 @@ struct btrfs_pending_snapshot { /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; - /* extra metadata reseration for relocation */ + /* extra metadata reservation for relocation */ int error; bool readonly; struct list_head list; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e692eea87af6..ef9c55bc7907 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "backref.h" #include "hash.h" #include "compression.h" +#include "qgroup.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; offset = key->offset - btrfs_file_extent_offset(eb, item); + /* + * Manually record dirty extent, as here we did a shallow + * file extent item copy and skip normal backref update, + * but modifying extent tree all by ourselves. + * So need to manually record dirty extent for qgroup, + * as the owner of the file extent changed from log tree + * (doesn't affect qgroup) to fs/file tree(affects qgroup) + */ + ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + btrfs_file_extent_disk_bytenr(eb, item), + btrfs_file_extent_disk_num_bytes(eb, item), + GFP_NOFS); + if (ret < 0) + goto out; + if (ins.objectid > 0) { u64 csum_start; u64 csum_end; @@ -2330,7 +2346,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, break; /* for regular files, make sure corresponding - * orhpan item exist. extents past the new EOF + * orphan item exist. extents past the new EOF * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { @@ -2422,8 +2438,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(parent); next = btrfs_find_create_tree_block(root, bytenr); - if (!next) - return -ENOMEM; + if (IS_ERR(next)) + return PTR_ERR(next); if (*level == 1) { ret = wc->process_func(root, next, wc, ptr_gen); @@ -2757,7 +2773,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root, SSD) && + if (!btrfs_test_opt(root->fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2788,7 +2804,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); btrfs_set_log_full_commit(root->fs_info, trans); mutex_unlock(&root->log_mutex); @@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx); + btrfs_init_log_ctx(&root_log_ctx, NULL); mutex_lock(&log_root_tree->log_mutex); atomic_inc(&log_root_tree->log_batch); @@ -2838,7 +2854,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); if (ret != -ENOSPC) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } @@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { blk_finish_plug(&plug); + list_del_init(&root_log_ctx.list); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -2898,7 +2915,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; @@ -2934,7 +2951,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = write_ctree_super(trans, root->fs_info->tree_root, 1); if (ret) { btrfs_set_log_full_commit(root->fs_info, trans); - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -2991,7 +3008,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, ret = walk_log_tree(trans, log, &wc); /* I don't think this can happen but just in case */ if (ret) - btrfs_abort_transaction(trans, log, ret); + btrfs_abort_transaction(trans, ret); while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, @@ -3001,7 +3018,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); + EXTENT_DIRTY | EXTENT_NEW); } /* @@ -3160,7 +3177,7 @@ out_unlock: btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); @@ -3193,7 +3210,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, btrfs_set_log_full_commit(root->fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); return ret; @@ -4141,6 +4158,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&extents); + down_write(&BTRFS_I(inode)->dio_sem); write_lock(&tree->lock); test_gen = root->fs_info->last_trans_committed; @@ -4169,13 +4187,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); + btrfs_get_logged_extents(inode, logged_list, start, end); /* - * Collect any new ordered extents within the range. This is to - * prevent logging file extent items without waiting for the disk - * location they point to being written. We do this only to deal - * with races against concurrent lockless direct IO writes. + * Some ordered extents started by fsync might have completed + * before we could collect them into the list logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. */ - btrfs_get_logged_extents(inode, logged_list, start, end); + ret = btrfs_inode_check_errors(inode); + if (ret) + ctx->io_err = ret; process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4202,6 +4227,7 @@ process: } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); + up_write(&BTRFS_I(inode)->dio_sem); btrfs_release_path(path); return ret; @@ -4460,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, static int btrfs_check_ref_name_override(struct extent_buffer *eb, const int slot, const struct btrfs_key *key, - struct inode *inode) + struct inode *inode, + u64 *other_ino) { int ret; struct btrfs_path *search_path; @@ -4519,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, search_path, parent, name, this_name_len, 0); if (di && !IS_ERR(di)) { - ret = 1; + struct btrfs_key di_key; + + btrfs_dir_item_key_to_cpu(search_path->nodes[0], + di, &di_key); + if (di_key.type == BTRFS_INODE_ITEM_KEY) { + ret = 1; + *other_ino = di_key.objectid; + } else { + ret = -EAGAIN; + } goto out; } else if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -4623,23 +4659,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, mutex_lock(&BTRFS_I(inode)->log_mutex); /* - * Collect ordered extents only if we are logging data. This is to - * ensure a subsequent request to log this inode in LOG_INODE_ALL mode - * will process the ordered extents if they still exists at the time, - * because when we collect them we test and set for the flag - * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the - * same ordered extents. The consequence for the LOG_INODE_ALL log mode - * not processing the ordered extents is that we end up logging the - * corresponding file extent items, based on the extent maps in the - * inode's extent_map_tree's modified_list, without logging the - * respective checksums (since the may still be only attached to the - * ordered extents and have not been inserted in the csum tree by - * btrfs_finish_ordered_io() yet). - */ - if (inode_only == LOG_INODE_ALL) - btrfs_get_logged_extents(inode, &logged_list, start, end); - - /* * a brute force approach to making sure we get the most uptodate * copies of everything. */ @@ -4711,6 +4730,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ins_nr = 0; ret = btrfs_search_forward(root, &min_key, path, trans->transid); + if (ret < 0) { + err = ret; + goto out_unlock; + } if (ret != 0) break; again: @@ -4726,16 +4749,72 @@ again: if ((min_key.type == BTRFS_INODE_REF_KEY || min_key.type == BTRFS_INODE_EXTREF_KEY) && BTRFS_I(inode)->generation == trans->transid) { + u64 other_ino = 0; + ret = btrfs_check_ref_name_override(path->nodes[0], path->slots[0], - &min_key, inode); + &min_key, inode, + &other_ino); if (ret < 0) { err = ret; goto out_unlock; - } else if (ret > 0) { - err = 1; - btrfs_set_log_full_commit(root->fs_info, trans); - goto out_unlock; + } else if (ret > 0 && ctx && + other_ino != btrfs_ino(ctx->inode)) { + struct btrfs_key inode_key; + struct inode *other_inode; + + if (ins_nr > 0) { + ins_nr++; + } else { + ins_nr = 1; + ins_start_slot = path->slots[0]; + } + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, + logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + btrfs_release_path(path); + inode_key.objectid = other_ino; + inode_key.type = BTRFS_INODE_ITEM_KEY; + inode_key.offset = 0; + other_inode = btrfs_iget(root->fs_info->sb, + &inode_key, root, + NULL); + /* + * If the other inode that had a conflicting dir + * entry was deleted in the current transaction, + * we don't need to do more work nor fallback to + * a transaction commit. + */ + if (IS_ERR(other_inode) && + PTR_ERR(other_inode) == -ENOENT) { + goto next_key; + } else if (IS_ERR(other_inode)) { + err = PTR_ERR(other_inode); + goto out_unlock; + } + /* + * We are safe logging the other inode without + * acquiring its i_mutex as long as we log with + * the LOG_INODE_EXISTS mode. We're safe against + * concurrent renames of the other inode as well + * because during a rename we pin the log and + * update the log with the new name before we + * unpin it. + */ + err = btrfs_log_inode(trans, root, other_inode, + LOG_INODE_EXISTS, + 0, LLONG_MAX, ctx); + iput(other_inode); + if (err) + goto out_unlock; + else + goto next_key; } } @@ -4803,7 +4882,7 @@ next_slot: ins_nr = 0; } btrfs_release_path(path); - +next_key: if (min_key.offset < (u64)-1) { min_key.offset++; } else if (min_key.type < max_key.type) { @@ -4846,21 +4925,6 @@ log_extents: goto out_unlock; } if (fast_search) { - /* - * Some ordered extents started by fsync might have completed - * before we collected the ordered extents in logged_list, which - * means they're gone, not in our logged_list nor in the inode's - * ordered tree. We want the application/user space to know an - * error happened while attempting to persist file data so that - * it can take proper action. If such error happened, we leave - * without writing to the log tree and the fsync must report the - * file data write error and not commit the current transaction. - */ - err = btrfs_inode_check_errors(inode); - if (err) { - ctx->io_err = err; - goto out_unlock; - } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx, start, end); if (ret) { @@ -4937,7 +5001,7 @@ out_unlock: * the actual unlink operation, so if we do this check before a concurrent task * sets last_unlink_trans it means we've logged a consistent version/state of * all the inode items, otherwise we are not sure and must do a transaction - * commit (the concurrent task migth have only updated last_unlink_trans before + * commit (the concurrent task might have only updated last_unlink_trans before * we logged the inode or it might have also done the unlink). */ static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans, @@ -4996,7 +5060,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, while (1) { /* * If we are logging a directory then we start with our inode, - * not our parents inode, so we need to skipp setting the + * not our parent's inode, so we need to skip setting the * logged_trans so that further down in the log code we don't * think this inode has already been logged. */ @@ -5012,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || d_really_is_negative(parent) || sb != parent->d_sb) break; - if (IS_ROOT(parent)) + if (IS_ROOT(parent)) { + inode = d_inode(parent); + if (btrfs_must_commit_transaction(trans, inode)) + ret = 1; break; + } parent = dget_parent(parent); dput(old_parent); @@ -5158,7 +5226,7 @@ process_leaf: } ctx->log_new_dentries = false; - if (type == BTRFS_FT_DIR) + if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, @@ -5278,11 +5346,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (IS_ERR(dir_inode)) continue; + if (ctx) + ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, dir_inode, LOG_INODE_ALL, 0, LLONG_MAX, ctx); if (!ret && btrfs_must_commit_transaction(trans, dir_inode)) ret = 1; + if (!ret && ctx && ctx->log_new_dentries) + ret = log_new_dir_dentries(trans, root, + dir_inode, ctx); iput(dir_inode); if (ret) goto out; @@ -5319,7 +5392,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; - if (btrfs_test_opt(root, NOTREELOG)) { + if (btrfs_test_opt(root->fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } @@ -5375,7 +5448,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, log_dentries = true; /* - * On unlink we must make sure all our current and old parent directores + * On unlink we must make sure all our current and old parent directory * inodes are fully logged. This is to prevent leaving dangling * directory index entries in directories that were our parents but are * not anymore. Not doing this results in old parent directory being @@ -5519,7 +5592,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_std_error(fs_info, ret, "Failed to pin buffers while " + btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5533,7 +5606,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5551,7 +5624,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_std_error(fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5566,7 +5639,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_std_error(fs_info, ret, "Couldn't read target root " + btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } @@ -5652,11 +5725,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, * into the file. When the file is logged we check it and * don't log the parents if the file is fully on disk. */ - if (S_ISREG(inode->i_mode)) { - mutex_lock(&BTRFS_I(inode)->log_mutex); - BTRFS_I(inode)->last_unlink_trans = trans->transid; - mutex_unlock(&BTRFS_I(inode)->log_mutex); - } + mutex_lock(&BTRFS_I(inode)->log_mutex); + BTRFS_I(inode)->last_unlink_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); /* * if this directory was already logged any new diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index a9f1b75d080d..ab858e31ccbc 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -30,15 +30,18 @@ struct btrfs_log_ctx { int log_transid; int io_err; bool log_new_dentries; + struct inode *inode; struct list_head list; }; -static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) +static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; ctx->log_new_dentries = false; + ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 91feb2bdefee..b1434bb57e36 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -28,7 +28,7 @@ * } * ulist_free(ulist); * - * This assumes the graph nodes are adressable by u64. This stems from the + * This assumes the graph nodes are addressable by u64. This stems from the * usage for tree enumeration in btrfs, where the logical addresses are * 64 bit. * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd0f45fb38c4..035efce603a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,13 +20,13 @@ #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> -#include <linux/random.h> #include <linux/iocontext.h> #include <linux/capability.h> #include <linux/ratelimit.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/semaphore.h> +#include <linux/uuid.h> #include <asm/div64.h> #include "ctree.h" #include "extent_map.h" @@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, }; +/* + * Table to convert BTRFS_RAID_* to the error code if minimum number of devices + * condition is not met. Zero means there's no corresponding + * BTRFS_ERROR_DEV_*_NOT_MET value. + */ +const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, + [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + [BTRFS_RAID_DUP] = 0, + [BTRFS_RAID_RAID0] = 0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, + [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -125,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -447,7 +461,7 @@ loop_lock: sync_pending = 0; } - btrfsic_submit_bio(cur->bi_rw, cur); + btrfsic_submit_bio(cur); num_run++; batch_run++; @@ -699,7 +713,8 @@ static noinline int device_list_add(const char *path, * if there is new btrfs on an already registered device, * then remove the stale device entry. */ - btrfs_free_stale_device(device); + if (ret > 0) + btrfs_free_stale_device(device); *fs_devices_ret = fs_devices; @@ -819,10 +834,6 @@ static void __free_device(struct work_struct *work) struct btrfs_device *device; device = container_of(work, struct btrfs_device, rcu_work); - - if (device->bdev) - blkdev_put(device->bdev, device->mode); - rcu_string_free(device->name); kfree(device); } @@ -837,6 +848,54 @@ static void free_device(struct rcu_head *head) schedule_work(&device->rcu_work); } +static void btrfs_close_bdev(struct btrfs_device *device) +{ + if (device->bdev && device->writeable) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + + if (device->bdev) + blkdev_put(device->bdev, device->mode); +} + +static void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + btrfs_close_bdev(device); + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} + static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; @@ -988,6 +1047,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, return ret; } +void btrfs_release_disk_super(struct page *page) +{ + kunmap(page); + put_page(page); +} + +int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, + struct page **page, struct btrfs_super_block **disk_super) +{ + void *p; + pgoff_t index; + + /* make sure our super fits in the device */ + if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + return 1; + + /* make sure our super fits in the page */ + if (sizeof(**disk_super) > PAGE_SIZE) + return 1; + + /* make sure our super doesn't straddle pages on disk */ + index = bytenr >> PAGE_SHIFT; + if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index) + return 1; + + /* pull in the page with our super */ + *page = read_cache_page_gfp(bdev->bd_inode->i_mapping, + index, GFP_KERNEL); + + if (IS_ERR_OR_NULL(*page)) + return 1; + + p = kmap(*page); + + /* align our pointer to the offset of the super block */ + *disk_super = p + (bytenr & ~PAGE_MASK); + + if (btrfs_super_bytenr(*disk_super) != bytenr || + btrfs_super_magic(*disk_super) != BTRFS_MAGIC) { + btrfs_release_disk_super(*page); + return 1; + } + + if ((*disk_super)->label[0] && + (*disk_super)->label[BTRFS_LABEL_SIZE - 1]) + (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0'; + + return 0; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock @@ -999,13 +1108,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_super_block *disk_super; struct block_device *bdev; struct page *page; - void *p; int ret = -EINVAL; u64 devid; u64 transid; u64 total_devices; u64 bytenr; - pgoff_t index; /* * we would like to check all the supers, but that would make @@ -1018,41 +1125,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); bdev = blkdev_get_by_path(path, flags, holder); - if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto error; } - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) - goto error_bdev_put; - - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) - goto error_bdev_put; - - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - goto error_bdev_put; - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_inode->i_mapping, - index, GFP_NOFS); - - if (IS_ERR_OR_NULL(page)) + if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) goto error_bdev_put; - p = kmap(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + (bytenr & ~PAGE_MASK); - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) - goto error_unmap; - devid = btrfs_stack_device_id(&disk_super->dev_item); transid = btrfs_super_generation(disk_super); total_devices = btrfs_super_num_devices(disk_super); @@ -1060,8 +1140,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, ret = device_list_add(path, disk_super, devid, fs_devices_ret); if (ret > 0) { if (disk_super->label[0]) { - if (disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); } else { printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); @@ -1073,9 +1151,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, if (!ret && fs_devices_ret) (*fs_devices_ret)->total_devices = total_devices; -error_unmap: - kunmap(page); - put_page(page); + btrfs_release_disk_super(page); error_bdev_put: blkdev_put(bdev, flags); @@ -1454,7 +1530,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_std_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1462,7 +1538,7 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); @@ -1688,32 +1764,92 @@ out: return ret; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path) +/* + * Verify that @num_devices satisfies the RAID profile constraints in the whole + * filesystem. It's up to the caller to adjust that number regarding eg. device + * replace. + */ +static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, + u64 num_devices) +{ + u64 all_avail; + unsigned seq; + int i; + + do { + seq = read_seqbegin(&fs_info->profiles_lock); + + all_avail = fs_info->avail_data_alloc_bits | + fs_info->avail_system_alloc_bits | + fs_info->avail_metadata_alloc_bits; + } while (read_seqretry(&fs_info->profiles_lock, seq)); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!(all_avail & btrfs_raid_group[i])) + continue; + + if (num_devices < btrfs_raid_array[i].devs_min) { + int ret = btrfs_raid_mindev_error[i]; + + if (ret) + return ret; + } + } + + return 0; +} + +struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs, + struct btrfs_device *device) { - struct btrfs_device *device; struct btrfs_device *next_device; - struct block_device *bdev; - struct buffer_head *bh = NULL; - struct btrfs_super_block *disk_super; + + list_for_each_entry(next_device, &fs_devs->devices, dev_list) { + if (next_device != device && + !next_device->missing && next_device->bdev) + return next_device; + } + + return NULL; +} + +/* + * Helper function to check if the given device is part of s_bdev / latest_bdev + * and replace it with the provided or the next active device, in the context + * where this function called, there should be always be another device (or + * this_dev) which is active. + */ +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev) +{ + struct btrfs_device *next_device; + + if (this_dev) + next_device = this_dev; + else + next_device = btrfs_find_next_active_device(fs_info->fs_devices, + device); + ASSERT(next_device); + + if (fs_info->sb->s_bdev && + (fs_info->sb->s_bdev == device->bdev)) + fs_info->sb->s_bdev = next_device->bdev; + + if (fs_info->fs_devices->latest_bdev == device->bdev) + fs_info->fs_devices->latest_bdev = next_device->bdev; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) +{ + struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; - u64 all_avail; - u64 devid; u64 num_devices; - u8 *dev_uuid; - unsigned seq; int ret = 0; bool clear_super = false; + char *dev_name = NULL; mutex_lock(&uuid_mutex); - do { - seq = read_seqbegin(&root->fs_info->profiles_lock); - - all_avail = root->fs_info->avail_data_alloc_bits | - root->fs_info->avail_system_alloc_bits | - root->fs_info->avail_metadata_alloc_bits; - } while (read_seqretry(&root->fs_info->profiles_lock, seq)); - num_devices = root->fs_info->fs_devices->num_devices; btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { @@ -1722,78 +1858,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); - if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { - ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; - goto out; - } - - if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET; + ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); + if (ret) goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && - root->fs_info->fs_devices->rw_devices <= 2) { - ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; - goto out; - } - if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && - root->fs_info->fs_devices->rw_devices <= 3) { - ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET; + ret = btrfs_find_device_by_devspec(root, devid, device_path, + &device); + if (ret) goto out; - } - - if (strcmp(device_path, "missing") == 0) { - struct list_head *devices; - struct btrfs_device *tmp; - - device = NULL; - devices = &root->fs_info->fs_devices->devices; - /* - * It is safe to read the devices since the volume_mutex - * is held. - */ - list_for_each_entry(tmp, devices, dev_list) { - if (tmp->in_fs_metadata && - !tmp->is_tgtdev_for_dev_replace && - !tmp->bdev) { - device = tmp; - break; - } - } - bdev = NULL; - bh = NULL; - disk_super = NULL; - if (!device) { - ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; - goto out; - } - } else { - ret = btrfs_get_bdev_and_sb(device_path, - FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder, 0, - &bdev, &bh); - if (ret) - goto out; - disk_super = (struct btrfs_super_block *)bh->b_data; - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); - if (!device) { - ret = -ENOENT; - goto error_brelse; - } - } if (device->is_tgtdev_for_dev_replace) { ret = BTRFS_ERROR_DEV_TGT_REPLACE; - goto error_brelse; + goto out; } if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; - goto error_brelse; + goto out; } if (device->writeable) { @@ -1801,6 +1882,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; unlock_chunks(root); + dev_name = kstrdup(device->name->str, GFP_KERNEL); + if (!dev_name) { + ret = -ENOMEM; + goto error_undo; + } clear_super = true; } @@ -1842,12 +1928,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->missing) device->fs_devices->missing_devices--; - next_device = list_entry(root->fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (device->bdev == root->fs_info->sb->s_bdev) - root->fs_info->sb->s_bdev = next_device->bdev; - if (device->bdev == root->fs_info->fs_devices->latest_bdev) - root->fs_info->fs_devices->latest_bdev = next_device->bdev; + btrfs_assign_next_active_device(root->fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; @@ -1855,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } + btrfs_close_bdev(device); + call_rcu(&device->rcu, free_device); num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; @@ -1883,63 +1966,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) * at this point, the device is zero sized. We want to * remove it from the devices list and zero out the old super */ - if (clear_super && disk_super) { - u64 bytenr; - int i; - - /* make sure this device isn't detected as part of - * the FS anymore - */ - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - - /* clear the mirror copies of super block on the disk - * being removed, 0th copy is been taken care above and - * the below would take of the rest - */ - for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - - brelse(bh); - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) - continue; - - disk_super = (struct btrfs_super_block *)bh->b_data; - - if (btrfs_super_bytenr(disk_super) != bytenr || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - continue; - } - memset(&disk_super->magic, 0, - sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); + if (clear_super) { + struct block_device *bdev; + + bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); + if (!IS_ERR(bdev)) { + btrfs_scratch_superblocks(bdev, dev_name); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); } } - ret = 0; - - if (bdev) { - /* Notify udev that device has changed */ - btrfs_kobject_uevent(bdev, KOBJ_CHANGE); - - /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); - } - -error_brelse: - brelse(bh); - if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: + kfree(dev_name); + mutex_unlock(&uuid_mutex); return ret; + error_undo: if (device->writeable) { lock_chunks(root); @@ -1948,7 +1991,7 @@ error_undo: device->fs_devices->rw_devices++; unlock_chunks(root); } - goto error_brelse; + goto out; } void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, @@ -1972,11 +2015,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->missing) fs_devices->missing_devices--; - if (srcdev->writeable) { + if (srcdev->writeable) fs_devices->rw_devices--; - /* zero out the old super if it is writable */ - btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); - } if (srcdev->bdev) fs_devices->open_devices--; @@ -1987,6 +2027,13 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; + if (srcdev->writeable) { + /* zero out the old super if it is writable */ + btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str); + } + + btrfs_close_bdev(srcdev); + call_rcu(&srcdev->rcu, free_device); /* @@ -2016,32 +2063,35 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { - struct btrfs_device *next_device; - mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); - if (tgtdev->bdev) { - btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + if (tgtdev->bdev) fs_info->fs_devices->open_devices--; - } + fs_info->fs_devices->num_devices--; - next_device = list_entry(fs_info->fs_devices->devices.next, - struct btrfs_device, dev_list); - if (tgtdev->bdev == fs_info->sb->s_bdev) - fs_info->sb->s_bdev = next_device->bdev; - if (tgtdev->bdev == fs_info->fs_devices->latest_bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; - list_del_rcu(&tgtdev->dev_list); + btrfs_assign_next_active_device(fs_info, tgtdev, NULL); - call_rcu(&tgtdev->rcu, free_device); + list_del_rcu(&tgtdev->dev_list); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + + /* + * The update_dev_time() with in btrfs_scratch_superblocks() + * may lead to a call to btrfs_show_devname() which will try + * to hold device_list_mutex. And here this device + * is already out of device list, so we don't have to hold + * the device_list_mutex lock. + */ + btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str); + + btrfs_close_bdev(tgtdev); + call_rcu(&tgtdev->rcu, free_device); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -2102,6 +2152,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } /* + * Lookup a device given by device id, or the path if the id is 0. + */ +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device) +{ + int ret; + + if (devid) { + ret = 0; + *device = btrfs_find_device(root->fs_info, devid, NULL, + NULL); + if (!*device) + ret = -ENOENT; + } else { + if (!devpath || !devpath[0]) + return -EINVAL; + + ret = btrfs_find_device_missing_or_by_path(root, devpath, + device); + } + return ret; +} + +/* * does all the dirty work required for changing file system's UUID. */ static int btrfs_prepare_sprout(struct btrfs_root *root) @@ -2165,7 +2240,7 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) } /* - * strore the expected generation for seed devices in device items. + * Store the expected generation for seed devices in device items. */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_root *root) @@ -2374,14 +2449,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = init_first_rw_device(trans, root, device); unlock_chunks(root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } } ret = btrfs_add_device(trans, root, device); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2390,7 +2465,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_finish_sprout(trans, root); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto error_trans; } @@ -2418,7 +2493,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2663,7 +2738,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_std_error(root->fs_info, -ENOENT, + btrfs_handle_fs_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2671,7 +2746,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_std_error(root->fs_info, ret, + btrfs_handle_fs_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2736,6 +2811,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2762,13 +2838,20 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { - btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2786,14 +2869,17 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { - btrfs_abort_transaction(trans, root, ret); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_abort_transaction(trans, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2802,14 +2888,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); if (ret) { - btrfs_abort_transaction(trans, root, ret); + btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { - btrfs_abort_transaction(trans, extent_root, ret); + btrfs_abort_transaction(trans, ret); goto out; } @@ -2857,7 +2943,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) chunk_offset); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(root->fs_info, ret, NULL); return ret; } @@ -2866,7 +2952,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) * chunk tree entries */ ret = btrfs_remove_chunk(trans, root, chunk_offset); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans, extent_root); return ret; } @@ -3362,7 +3448,7 @@ static int should_balance_chunk(struct btrfs_root *root, } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { /* * Same logic as the 'limit' filter; the minimum cannot be - * determined here because we do not have the global informatoin + * determined here because we do not have the global information * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) @@ -3385,7 +3471,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u64 size_to_free; u64 chunk_type; struct btrfs_chunk *chunk; - struct btrfs_path *path; + struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_trans_handle *trans; @@ -3402,6 +3488,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) u32 count_meta = 0; u32 count_sys = 0; int chunk_reserved = 0; + u64 bytes_used = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3418,13 +3505,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) ret = btrfs_shrink_device(device, old_size - size_to_free); if (ret == -ENOSPC) break; - BUG_ON(ret); + if (ret) { + /* btrfs_shrink_device never returns ret > 0 */ + WARN_ON(ret > 0); + goto error; + } trans = btrfs_start_transaction(dev_root, 0); - BUG_ON(IS_ERR(trans)); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_info_in_rcu(fs_info, + "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } ret = btrfs_grow_device(trans, device, old_size); - BUG_ON(ret); + if (ret) { + btrfs_end_transaction(trans, dev_root); + /* btrfs_grow_device never returns ret > 0 */ + WARN_ON(ret > 0); + btrfs_info_in_rcu(fs_info, + "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu", + rcu_str_deref(device->name), ret, + old_size, old_size - size_to_free); + goto error; + } btrfs_end_transaction(trans, dev_root); } @@ -3540,7 +3647,13 @@ again: goto loop; } - if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { + ASSERT(fs_info->data_sinfo); + spin_lock(&fs_info->data_sinfo->lock); + bytes_used = fs_info->data_sinfo->bytes_used; + spin_unlock(&fs_info->data_sinfo->lock); + + if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + !chunk_reserved && !bytes_used) { trans = btrfs_start_transaction(chunk_root, 0); if (IS_ERR(trans)) { mutex_unlock(&fs_info->delete_unused_bgs_mutex); @@ -3632,7 +3745,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } @@ -3693,10 +3806,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl, num_devices--; } btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; - if (num_devices == 1) - allowed |= BTRFS_BLOCK_GROUP_DUP; - else if (num_devices > 1) + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP; + if (num_devices > 1) allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); if (num_devices > 2) allowed |= BTRFS_BLOCK_GROUP_RAID5; @@ -3844,7 +3955,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->balance_lock); - if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) { + if (btrfs_test_opt(fs_info, SKIP_BALANCE)) { btrfs_info(fs_info, "force skipping balance"); return 0; } @@ -4199,7 +4310,8 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { ret = PTR_ERR(uuid_root); - btrfs_abort_transaction(trans, tree_root, ret); + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans, tree_root); return ret; } @@ -4472,8 +4584,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) btrfs_set_fs_incompat(info, RAID56); } -#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \ - - sizeof(struct btrfs_item) \ +#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -4652,12 +4763,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & BTRFS_BLOCK_GROUP_RAID5) { raid_stripe_len = find_raid56_stripe_len(ndevs - 1, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 1; } if (type & BTRFS_BLOCK_GROUP_RAID6) { raid_stripe_len = find_raid56_stripe_len(ndevs - 2, - btrfs_super_stripesize(info->super_copy)); + extent_root->stripesize); data_stripes = num_stripes - 2; } @@ -5218,7 +5329,7 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) kfree(bbio); } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) @@ -5278,7 +5389,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; - BUG_ON(offset < stripe_offset); + if (offset < stripe_offset) { + btrfs_crit(fs_info, "stripe math has gone wrong, " + "stripe_offset=%llu, offset=%llu, start=%llu, " + "logical=%llu, stripe_len=%llu", + stripe_offset, offset, em->start, logical, + stripe_len); + free_extent_map(em); + return -EINVAL; + } /* stripe_offset is the offset of this block in its stripe*/ stripe_offset = offset - stripe_offset; @@ -5296,7 +5415,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, raid56_full_stripe_start *= full_stripe_len; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; @@ -5309,7 +5428,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - (rw & REQ_WRITE)) { + (op == REQ_OP_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); } else { @@ -5334,8 +5453,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && - dev_replace->tgtdev != NULL) { + op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { /* * in dev-replace case, for repair case (that's the only * case where the mirror is selected explicitly when @@ -5422,15 +5541,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, (offset + *length); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (rw & REQ_DISCARD) + if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5443,7 +5564,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) { + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD || + op == REQ_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5457,9 +5579,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (rw & REQ_DISCARD) + else if (op == REQ_OP_DISCARD) num_stripes = min_t(u64, map->sub_stripes * (stripe_nr_end - stripe_nr_orig), map->num_stripes); @@ -5477,7 +5599,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (need_raid_map && - ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div_u64(raid56_full_stripe_start, @@ -5505,8 +5627,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if (!(rw & (REQ_WRITE | REQ_DISCARD | - REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD && + op != REQ_GET_READ_MIRRORS) && mirror_num <= 1) mirror_num = 1; } } else { @@ -5519,13 +5641,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, &stripe_index); mirror_num = stripe_index + 1; } - BUG_ON(stripe_index >= map->num_stripes); + if (stripe_index >= map->num_stripes) { + btrfs_crit(fs_info, "stripe index math went horribly wrong, " + "got stripe_index=%u, num_stripes=%u", + stripe_index, map->num_stripes); + ret = -EINVAL; + goto out; + } num_alloc_stripes = num_stripes; if (dev_replace_is_ongoing) { - if (rw & (REQ_WRITE | REQ_DISCARD)) + if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) num_alloc_stripes <<= 1; - if (rw & REQ_GET_READ_MIRRORS) + if (op == REQ_GET_READ_MIRRORS) num_alloc_stripes++; tgtdev_indexes = num_stripes; } @@ -5540,7 +5668,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + need_raid_map && + ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5565,7 +5694,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, RAID6_Q_STRIPE; } - if (rw & REQ_DISCARD) { + if (op == REQ_OP_DISCARD) { u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; @@ -5645,14 +5774,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); tgtdev_indexes = 0; - if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && + if (dev_replace_is_ongoing && + (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) && dev_replace->tgtdev != NULL) { int index_where_to_add; u64 srcdev_devid = dev_replace->srcdev->devid; @@ -5687,7 +5817,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) && + } else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) && dev_replace->tgtdev != NULL) { u64 srcdev_devid = dev_replace->srcdev->devid; int index_srcdev = 0; @@ -5718,20 +5848,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } @@ -5762,21 +5889,21 @@ out: return ret; } -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, 0); } /* For Scrub/replace */ -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map) { - return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, mirror_num, need_raid_map); } @@ -5890,13 +6017,13 @@ static void btrfs_end_bio(struct bio *bio) BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; if (dev->bdev) { - if (bio->bi_rw & WRITE) + if (bio_op(bio) == REQ_OP_WRITE) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_WRITE_ERRS); else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH) + if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); @@ -5944,7 +6071,7 @@ static void btrfs_end_bio(struct bio *bio) */ static noinline void btrfs_schedule_bio(struct btrfs_root *root, struct btrfs_device *device, - int rw, struct bio *bio) + struct bio *bio) { int should_queue = 1; struct btrfs_pending_bios *pending_bios; @@ -5955,9 +6082,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, } /* don't bother with additional async steps for reads, right now */ - if (!(rw & REQ_WRITE)) { + if (bio_op(bio) == REQ_OP_READ) { bio_get(bio); - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); bio_put(bio); return; } @@ -5971,10 +6098,9 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, atomic_inc(&root->fs_info->nr_async_bios); WARN_ON(bio->bi_next); bio->bi_next = NULL; - bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio->bi_rw & REQ_SYNC) + if (bio->bi_opf & REQ_SYNC) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; @@ -5997,7 +6123,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root, static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, struct bio *bio, u64 physical, int dev_nr, - int rw, int async) + int async) { struct btrfs_device *dev = bbio->stripes[dev_nr].dev; @@ -6011,8 +6137,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, rcu_read_lock(); name = rcu_dereference(dev->name); - pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " - "(%s id %llu), size=%u\n", rw, + pr_debug("btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu " + "(%s id %llu), size=%u\n", bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); @@ -6023,16 +6149,16 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, btrfs_bio_counter_inc_noblocked(root->fs_info); if (async) - btrfs_schedule_bio(root, dev, rw, bio); + btrfs_schedule_bio(root, dev, bio); else - btrfsic_submit_bio(rw, bio); + btrfsic_submit_bio(bio); } static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) { atomic_inc(&bbio->error); if (atomic_dec_and_test(&bbio->stripes_pending)) { - /* Shoud be the original bio. */ + /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; @@ -6042,7 +6168,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit) { struct btrfs_device *dev; @@ -6059,8 +6185,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, map_length = length; btrfs_bio_counter_inc_blocked(root->fs_info); - ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, 1); + ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical, + &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -6074,10 +6200,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, atomic_set(&bbio->stripes_pending, bbio->num_stripes); if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && - ((rw & WRITE) || (mirror_num > 1))) { + ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ - if (rw & WRITE) { + if (bio_op(bio) == REQ_OP_WRITE) { ret = raid56_parity_write(root, bio, bbio, map_length); } else { ret = raid56_parity_recover(root, bio, bbio, map_length, @@ -6096,7 +6222,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; - if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { + if (!dev || !dev->bdev || + (bio_op(bio) == REQ_OP_WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); continue; } @@ -6108,7 +6235,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bio = first_bio; submit_stripe_bio(root, bbio, bio, - bbio->stripes[dev_nr].physical, dev_nr, rw, + bbio->stripes[dev_nr].physical, dev_nr, async_submit); } btrfs_bio_counter_dec(root->fs_info); @@ -6206,27 +6333,23 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, return dev; } -static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, - struct extent_buffer *leaf, - struct btrfs_chunk *chunk) +/* Return -EIO if any error, otherwise return 0. */ +static int btrfs_check_chunk_valid(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk, u64 logical) { - struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; - u64 logical; u64 length; u64 stripe_len; - u64 devid; - u8 uuid[BTRFS_UUID_SIZE]; - int num_stripes; - int ret; - int i; + u16 num_stripes; + u16 sub_stripes; + u64 type; - logical = key->offset; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); - /* Validation check */ + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); + if (!num_stripes) { btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", num_stripes); @@ -6237,24 +6360,70 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, "invalid chunk logical %llu", logical); return -EIO; } + if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) { + btrfs_err(root->fs_info, "invalid chunk sectorsize %u", + btrfs_chunk_sector_size(leaf, chunk)); + return -EIO; + } if (!length || !IS_ALIGNED(length, root->sectorsize)) { btrfs_err(root->fs_info, "invalid chunk length %llu", length); return -EIO; } - if (!is_power_of_2(stripe_len)) { + if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) { btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", stripe_len); return -EIO; } if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & - btrfs_chunk_type(leaf, chunk)) { + type) { btrfs_err(root->fs_info, "unrecognized chunk type: %llu", ~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & btrfs_chunk_type(leaf, chunk)); return -EIO; } + if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || + (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) || + (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || + (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || + (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) || + ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && + num_stripes != 1)) { + btrfs_err(root->fs_info, + "invalid num_stripes:sub_stripes %u:%u for profile %llu", + num_stripes, sub_stripes, + type & BTRFS_BLOCK_GROUP_PROFILE_MASK); + return -EIO; + } + + return 0; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 stripe_len; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + ret = btrfs_check_chunk_valid(root, leaf, chunk, logical); + if (ret) + return ret; read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6301,7 +6470,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, BTRFS_UUID_SIZE); map->stripes[i].dev = btrfs_find_device(root->fs_info, devid, uuid, NULL); - if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + if (!map->stripes[i].dev && + !btrfs_test_opt(root->fs_info, DEGRADED)) { free_extent_map(em); return -EIO; } @@ -6369,7 +6539,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, fs_devices = find_fsid(fsid); if (!fs_devices) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return ERR_PTR(-ENOENT); fs_devices = alloc_fs_devices(fsid); @@ -6431,7 +6601,7 @@ static int read_one_dev(struct btrfs_root *root, device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(root, DEGRADED)) + if (!btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; device = add_missing_dev(root, fs_devices, devid, dev_uuid); @@ -6440,7 +6610,7 @@ static int read_one_dev(struct btrfs_root *root, btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", devid, dev_uuid); } else { - if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED)) return -EIO; if(!device->bdev && !device->missing) { @@ -6502,6 +6672,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) u32 array_size; u32 len = 0; u32 cur_offset; + u64 type; struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); @@ -6511,12 +6682,12 @@ int btrfs_read_sys_array(struct btrfs_root *root) * overallocate but we can keep it as-is, only the first page is used. */ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); - if (!sb) - return -ENOMEM; + if (IS_ERR(sb)) + return PTR_ERR(sb); set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* - * The sb extent buffer is artifical and just used to read the system array. + * The sb extent buffer is artificial and just used to read the system array. * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all @@ -6568,6 +6739,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) break; } + type = btrfs_chunk_type(sb, chunk); + if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) { + btrfs_err(root->fs_info, + "invalid chunk type %llu in sys_array at offset %u", + type, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; @@ -6586,13 +6766,15 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb_array_offset += len; cur_offset += len; } - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return ret; out_short_read: printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", len, cur_offset); - free_extent_buffer(sb); + clear_extent_buffer_uptodate(sb); + free_extent_buffer_stale(sb); return -EIO; } @@ -6604,6 +6786,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) struct btrfs_key found_key; int ret; int slot; + u64 total_dev = 0; root = root->fs_info->chunk_root; @@ -6645,6 +6828,7 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) ret = read_one_dev(root, leaf, dev_item); if (ret) goto error; + total_dev++; } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { struct btrfs_chunk *chunk; chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -6654,6 +6838,28 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) } path->slots[0]++; } + + /* + * After loading chunk tree, we've got all device information, + * do another round of validation checks. + */ + if (total_dev != root->fs_info->fs_devices->total_devices) { + btrfs_err(root->fs_info, + "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_super_num_devices(root->fs_info->super_copy), + total_dev); + ret = -EINVAL; + goto error; + } + if (btrfs_super_total_bytes(root->fs_info->super_copy) < + root->fs_info->fs_devices->total_rw_bytes) { + btrfs_err(root->fs_info, + "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu", + btrfs_super_total_bytes(root->fs_info->super_copy), + root->fs_info->fs_devices->total_rw_bytes); + ret = -EINVAL; + goto error; + } ret = 0; error: unlock_chunks(root); @@ -7007,38 +7213,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } - -static void btrfs_close_one_device(struct btrfs_device *device) -{ - struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1939ebde63df..6613e6335ca2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -340,14 +340,14 @@ struct btrfs_raid_attr { }; extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; - +extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES]; extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; struct map_lookup { u64 type; int io_align; int io_width; - int stripe_len; + u64 stripe_len; int sector_size; int num_stripes; int sub_stripes; @@ -357,52 +357,6 @@ struct map_lookup { #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) -/* - * Restriper's general type filter - */ -#define BTRFS_BALANCE_DATA (1ULL << 0) -#define BTRFS_BALANCE_SYSTEM (1ULL << 1) -#define BTRFS_BALANCE_METADATA (1ULL << 2) - -#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \ - BTRFS_BALANCE_SYSTEM | \ - BTRFS_BALANCE_METADATA) - -#define BTRFS_BALANCE_FORCE (1ULL << 3) -#define BTRFS_BALANCE_RESUME (1ULL << 4) - -/* - * Balance filters - */ -#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0) -#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1) -#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2) -#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) -#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) -#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) -#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) -#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) -#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10) - -#define BTRFS_BALANCE_ARGS_MASK \ - (BTRFS_BALANCE_ARGS_PROFILES | \ - BTRFS_BALANCE_ARGS_USAGE | \ - BTRFS_BALANCE_ARGS_DEVID | \ - BTRFS_BALANCE_ARGS_DRANGE | \ - BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT | \ - BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ - BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ - BTRFS_BALANCE_ARGS_USAGE_RANGE) - -/* - * Profile changing flags. When SOFT is set we won't relocate chunk if - * it already has the target profile (even though it may be - * half-filled). - */ -#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8) -#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9) - struct btrfs_balance_args; struct btrfs_balance_progress; struct btrfs_balance_control { @@ -421,10 +375,10 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); @@ -437,7 +391,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, +int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); @@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); +void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, + struct btrfs_device *device, struct btrfs_device *this_dev); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); +int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, + char *devpath, + struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 03224b00ea70..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,6 +237,9 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + if (btrfs_root_readonly(root)) + return -EROFS; + if (trans) return do_setxattr(trans, inode, name, value, size, flags); @@ -377,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { @@ -432,25 +433,6 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { NULL, }; -int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_setxattr(dentry, name, value, size, flags); -} - -int btrfs_removexattr(struct dentry *dentry, const char *name) -{ - struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - - if (btrfs_root_readonly(root)) - return -EROFS; - return generic_removexattr(dentry, name); -} - static int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 96807b3d22f5..15fc4743dc70 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,9 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern int btrfs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags); -extern int btrfs_removexattr(struct dentry *dentry, const char *name); extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, |