diff options
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r-- | fs/btrfs/disk-io.c | 438 |
1 files changed, 251 insertions, 187 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fdce8799b98d..98b6a71decba 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,6 +29,8 @@ #include <linux/crc32c.h> #include <linux/slab.h> #include <linux/migrate.h> +#include <linux/ratelimit.h> +#include <asm/unaligned.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -40,6 +42,7 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "inode-map.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -136,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { * that covers the entire device */ static struct extent_map *btree_get_extent(struct inode *inode, - struct page *page, size_t page_offset, u64 start, u64 len, + struct page *page, size_t pg_offset, u64 start, u64 len, int create) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; @@ -153,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, } read_unlock(&em_tree->lock); - em = alloc_extent_map(GFP_NOFS); + em = alloc_extent_map(); if (!em) { em = ERR_PTR(-ENOMEM); goto out; @@ -198,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) void btrfs_csum_final(u32 crc, char *result) { - *(__le32 *)result = ~cpu_to_le32(crc); + put_unaligned_le32(~crc, result); } /* @@ -253,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs: %s checksum verify " + printk_ratelimited(KERN_INFO "btrfs: %s checksum verify " "failed on %llu wanted %X found %X " "level %d\n", root->fs_info->sb->s_id, (unsigned long long)buf->start, val, found, btrfs_header_level(buf)); - } if (result != (char *)&inline_result) kfree(result); return 1; @@ -295,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - if (printk_ratelimit()) { - printk("parent transid verify failed on %llu wanted %llu " + printk_ratelimited("parent transid verify failed on %llu wanted %llu " "found %llu\n", (unsigned long long)eb->start, (unsigned long long)parent_transid, (unsigned long long)btrfs_header_generation(eb)); - } ret = 1; clear_extent_buffer_uptodate(io_tree, eb, &cached_state); out: @@ -323,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, int num_copies = 0; int mirror_num = 0; + clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, start, 1, @@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, !verify_parent_transid(io_tree, eb, parent_transid)) return ret; + /* + * This buffer's crc is fine, but its contents are corrupted, so + * there is no reason to read the other copies, they won't be + * any less wrong. + */ + if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) + return ret; + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) @@ -359,14 +367,18 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) tree = &BTRFS_I(page->mapping->host)->io_tree; - if (page->private == EXTENT_PAGE_PRIVATE) + if (page->private == EXTENT_PAGE_PRIVATE) { + WARN_ON(1); goto out; - if (!page->private) + } + if (!page->private) { + WARN_ON(1); goto out; + } len = page->private >> 2; WARN_ON(len == 0); - eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + eb = alloc_extent_buffer(tree, start, len, page); if (eb == NULL) { WARN_ON(1); goto out; @@ -415,6 +427,73 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#define CORRUPT(reason, eb, root, slot) \ + printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \ + "root=%llu, slot=%d\n", reason, \ + (unsigned long long)btrfs_header_bytenr(eb), \ + (unsigned long long)root->objectid, slot) + +static noinline int check_leaf(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + struct btrfs_key key; + struct btrfs_key leaf_key; + u32 nritems = btrfs_header_nritems(leaf); + int slot; + + if (nritems == 0) + return 0; + + /* Check the 0 item */ + if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("invalid item offset size pair", leaf, root, 0); + return -EIO; + } + + /* + * Check to make sure each items keys are in the correct order and their + * offsets make sense. We only have to loop through nritems-1 because + * we check the current slot against the next slot, which verifies the + * next slot's offset+size makes sense and that the current's slot + * offset is correct. + */ + for (slot = 0; slot < nritems - 1; slot++) { + btrfs_item_key_to_cpu(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &key, slot + 1); + + /* Make sure the keys are in the right order */ + if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) { + CORRUPT("bad key order", leaf, root, slot); + return -EIO; + } + + /* + * Make sure the offset and ends are right, remember that the + * item data starts at the end of the leaf and grows towards the + * front. + */ + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + CORRUPT("slot offset bad", leaf, root, slot); + return -EIO; + } + + /* + * Check to make sure that we don't point outside of the leaf, + * just incase all the items are consistent to eachother, but + * all point outside of the leaf. + */ + if (btrfs_item_end_nr(leaf, slot) > + BTRFS_LEAF_DATA_SIZE(root)) { + CORRUPT("slot end outside of leaf", leaf, root, slot); + return -EIO; + } + } + + return 0; +} + #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) { @@ -444,7 +523,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, len = page->private >> 2; WARN_ON(len == 0); - eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + eb = alloc_extent_buffer(tree, start, len, page); if (eb == NULL) { ret = -EIO; goto out; @@ -452,12 +531,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs bad tree block start " + printk_ratelimited(KERN_INFO "btrfs bad tree block start " "%llu %llu\n", (unsigned long long)found_start, (unsigned long long)eb->start); - } ret = -EIO; goto err; } @@ -469,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - if (printk_ratelimit()) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", + printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", (unsigned long long)eb->start); - } ret = -EIO; goto err; } @@ -481,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, btrfs_set_buffer_lockdep_class(eb, found_level); ret = csum_tree_block(root, eb, 1); - if (ret) + if (ret) { ret = -EIO; + goto err; + } + + /* + * If this is a leaf block and it is corrupt, set the corrupt bit so + * that we don't try and read the other copies of this block, just + * return -EIO. + */ + if (found_level == 0 && check_leaf(root, eb)) { + set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + ret = -EIO; + } end = min_t(u64, eb->len, PAGE_CACHE_SIZE); end = eb->start + end - 1; @@ -557,12 +644,6 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) return 256 * limit; } -int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) -{ - return atomic_read(&info->nr_async_bios) > - btrfs_async_submit_limit(info); -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -843,7 +924,6 @@ static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, - .sync_page = block_sync_page, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif @@ -871,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; struct extent_buffer *eb; eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, GFP_NOFS); + bytenr, blocksize); return eb; } @@ -882,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, struct extent_buffer *eb; eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, - bytenr, blocksize, NULL, GFP_NOFS); + bytenr, blocksize, NULL); return eb; } @@ -966,13 +1046,13 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->name = NULL; root->in_sysfs = 0; root->inode_tree = RB_ROOT; + INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); - spin_lock_init(&root->node_lock); spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); spin_lock_init(&root->accounting_lock); @@ -988,7 +1068,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->log_transid = 0; root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, - fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->btree_inode->i_mapping); memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); @@ -1156,7 +1236,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root, fs_info, location->objectid); path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + kfree(root); + return ERR_PTR(-ENOMEM); + } ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); if (ret == 0) { l = path->nodes[0]; @@ -1180,27 +1263,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); out: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { root->ref_cows = 1; + btrfs_check_and_init_root_item(&root->root_item); + } return root; } -struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, - u64 root_objectid) -{ - struct btrfs_root *root; - - if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) - return fs_info->tree_root; - if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) - return fs_info->extent_root; - - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)root_objectid); - return root; -} - struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, struct btrfs_key *location) { @@ -1229,6 +1299,19 @@ again: if (IS_ERR(root)) return root; + root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); + if (!root->free_ino_ctl) + goto fail; + root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), + GFP_NOFS); + if (!root->free_ino_pinned) + goto fail; + + btrfs_init_free_ino_ctl(root); + mutex_init(&root->fs_commit_mutex); + spin_lock_init(&root->cache_lock); + init_waitqueue_head(&root->cache_wait); + set_anon_super(&root->anon_super, NULL); if (btrfs_root_refs(&root->root_item) == 0) { @@ -1272,41 +1355,6 @@ fail: return ERR_PTR(ret); } -struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *location, - const char *name, int namelen) -{ - return btrfs_read_fs_root_no_name(fs_info, location); -#if 0 - struct btrfs_root *root; - int ret; - - root = btrfs_read_fs_root_no_name(fs_info, location); - if (!root) - return NULL; - - if (root->in_sysfs) - return root; - - ret = btrfs_set_root_name(root, name, namelen); - if (ret) { - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); - } - - ret = btrfs_sysfs_add_root(root); - if (ret) { - free_extent_buffer(root->node); - kfree(root->name); - kfree(root); - return ERR_PTR(ret); - } - root->in_sysfs = 1; - return root; -#endif -} - static int btrfs_congested_fn(void *congested_data, int bdi_bits) { struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; @@ -1314,7 +1362,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) struct btrfs_device *device; struct backing_dev_info *bdi; - list_for_each_entry(device, &info->fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); @@ -1323,86 +1372,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) break; } } + rcu_read_unlock(); return ret; } /* - * this unplugs every device on the box, and it is only used when page - * is null - */ -static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct btrfs_device *device; - struct btrfs_fs_info *info; - - info = (struct btrfs_fs_info *)bdi->unplug_io_data; - list_for_each_entry(device, &info->fs_devices->devices, dev_list) { - if (!device->bdev) - continue; - - bdi = blk_get_backing_dev_info(device->bdev); - if (bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); - } -} - -static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ - struct inode *inode; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct address_space *mapping; - u64 offset; - - /* the generic O_DIRECT read code does this */ - if (1 || !page) { - __unplug_io_fn(bdi, page); - return; - } - - /* - * page->mapping may change at any time. Get a consistent copy - * and use that for everything below - */ - smp_mb(); - mapping = page->mapping; - if (!mapping) - return; - - inode = mapping->host; - - /* - * don't do the expensive searching for a small number of - * devices - */ - if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { - __unplug_io_fn(bdi, page); - return; - } - - offset = page_offset(page); - - em_tree = &BTRFS_I(inode)->extent_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - read_unlock(&em_tree->lock); - if (!em) { - __unplug_io_fn(bdi, page); - return; - } - - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - __unplug_io_fn(bdi, page); - return; - } - offset = offset - em->start; - btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, - em->block_start + offset, page); - free_extent_map(em); -} - -/* * If this fails, caller must call bdi_destroy() to get rid of the * bdi again. */ @@ -1416,8 +1390,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) return err; bdi->ra_pages = default_backing_dev_info.ra_pages; - bdi->unplug_io_fn = btrfs_unplug_io_fn; - bdi->unplug_io_data = info; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; @@ -1503,6 +1475,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + btrfs_run_defrag_inodes(root->fs_info); } if (freezing(current)) { @@ -1592,7 +1565,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_fs_info *fs_info = NULL; struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), @@ -1604,11 +1577,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, struct btrfs_super_block *disk_super; - if (!extent_root || !tree_root || !fs_info || + if (!extent_root || !tree_root || !tree_root->fs_info || !chunk_root || !dev_root || !csum_root) { err = -ENOMEM; goto fail; } + fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1628,6 +1602,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_bdi; } + fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); @@ -1641,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); + spin_lock_init(&fs_info->defrag_inodes_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1663,15 +1640,35 @@ struct btrfs_root *open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->defrag_running, 0); fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; + fs_info->defrag_inodes = RB_ROOT; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_extents); spin_lock_init(&fs_info->ordered_extent_lock); + fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), + GFP_NOFS); + if (!fs_info->delayed_root) { + err = -ENOMEM; + goto fail_iput; + } + btrfs_init_delayed_root(fs_info->delayed_root); + + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + init_rwsem(&fs_info->scrub_super_lock); + fs_info->scrub_workers_refcnt = 0; + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); @@ -1690,10 +1687,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping, - GFP_NOFS); - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, - GFP_NOFS); + fs_info->btree_inode->i_mapping); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; @@ -1707,9 +1702,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->block_group_cache_tree = RB_ROOT; extent_io_tree_init(&fs_info->freed_extents[0], - fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->btree_inode->i_mapping); extent_io_tree_init(&fs_info->freed_extents[1], - fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->btree_inode->i_mapping); fs_info->pinned_extents = &fs_info->freed_extents[0]; fs_info->do_barriers = 1; @@ -1739,7 +1734,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) { err = -EINVAL; - goto fail_iput; + goto fail_alloc; } memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); @@ -1751,17 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb, disk_super = &fs_info->super_copy; if (!btrfs_super_root(disk_super)) - goto fail_iput; + goto fail_alloc; /* check FS state, whether FS is broken. */ fs_info->fs_state |= btrfs_super_flags(disk_super); btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); + /* + * In the long term, we'll store the compression type in the super + * block, and it'll be used for per file compression control. + */ + fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + ret = btrfs_parse_options(tree_root, options); if (ret) { err = ret; - goto fail_iput; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super) & @@ -1771,7 +1772,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, "unsupported optional features (%Lx).\n", (unsigned long long)features); err = -EINVAL; - goto fail_iput; + goto fail_alloc; } features = btrfs_super_incompat_flags(disk_super); @@ -1787,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, "unsupported option features (%Lx).\n", (unsigned long long)features); err = -EINVAL; - goto fail_iput; + goto fail_alloc; } btrfs_init_workers(&fs_info->generic_worker, @@ -1834,6 +1835,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", 1, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1855,6 +1859,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); btrfs_start_workers(&fs_info->endio_freespace_worker, 1); + btrfs_start_workers(&fs_info->delayed_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1963,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->metadata_alloc_profile = (u64)-1; fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_init_space_info(fs_info); + if (ret) { + printk(KERN_ERR "Failed to initial space info: %d\n", ret); + goto fail_block_groups; + } + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); @@ -2054,9 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); - btrfs_orphan_cleanup(fs_info->fs_root); - btrfs_orphan_cleanup(fs_info->tree_root); + err = btrfs_orphan_cleanup(fs_info->fs_root); + if (!err) + err = btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); + if (err) { + close_ctree(tree_root); + return ERR_PTR(err); + } } return tree_root; @@ -2100,6 +2116,9 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); +fail_alloc: + kfree(fs_info->delayed_root); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2127,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (printk_ratelimit()) { - printk(KERN_WARNING "lost page write due to " + printk_ratelimited(KERN_WARNING "lost page write due to " "I/O error on %s\n", bdevname(bh->b_bdev, b)); - } /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors */ @@ -2295,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; continue; @@ -2328,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) } total_errors = 0; - list_for_each_entry(dev, head, dev_list) { + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) continue; if (!dev->in_fs_metadata || !dev->writeable) @@ -2366,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) if (btrfs_root_refs(&root->root_item) == 0) synchronize_srcu(&fs_info->subvol_srcu); + __btrfs_remove_free_space_cache(root->free_ino_pinned); + __btrfs_remove_free_space_cache(root->free_ino_ctl); free_fs_root(root); return 0; } static void free_fs_root(struct btrfs_root *root) { + iput(root->cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_super.s_dev) { down_write(&root->anon_super.s_umount); @@ -2379,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root) } free_extent_buffer(root->node); free_extent_buffer(root->commit_root); + kfree(root->free_ino_ctl); + kfree(root->free_ino_pinned); kfree(root->name); kfree(root); } @@ -2431,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { + int err; + root_objectid = gang[i]->root_key.objectid; - btrfs_orphan_cleanup(gang[i]); + err = btrfs_orphan_cleanup(gang[i]); + if (err) + return err; } root_objectid++; } @@ -2478,6 +2504,15 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + btrfs_scrub_cancel(root); + + /* wait for any defraggers to finish */ + wait_event(fs_info->transaction_wait, + (atomic_read(&fs_info->defrag_running) == 0)); + + /* clear out the rbtree of defraggable inodes */ + btrfs_run_defrag_inodes(root->fs_info); + btrfs_put_block_group_cache(fs_info); /* @@ -2489,7 +2524,7 @@ int close_ctree(struct btrfs_root *root) * ERROR state on disk. * * 2. when btrfs flips readonly just in btrfs_commit_super, - * and in such case, btrfs cannnot write sb via btrfs_commit_super, + * and in such case, btrfs cannot write sb via btrfs_commit_super, * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, * btrfs will cleanup all FS resources first and write sb then. */ @@ -2536,6 +2571,7 @@ int close_ctree(struct btrfs_root *root) del_fs_roots(fs_info); iput(fs_info->btree_inode); + kfree(fs_info->delayed_root); btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); @@ -2547,6 +2583,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->delayed_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); @@ -2623,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) if (current->flags & PF_MEMALLOC) return; + btrfs_balance_delayed_items(root); + + num_dirty = root->fs_info->dirty_metadata_bytes; + + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + u64 num_dirty; + unsigned long thresh = 32 * 1024 * 1024; + + if (current->flags & PF_MEMALLOC) + return; + num_dirty = root->fs_info->dirty_metadata_bytes; if (num_dirty > thresh) { @@ -2655,7 +2715,7 @@ int btree_lock_page_hook(struct page *page) goto out; len = page->private >> 2; - eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + eb = find_extent_buffer(io_tree, bytenr, len); if (!eb) goto out; @@ -2782,6 +2842,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, spin_lock(&delayed_refs->lock); if (delayed_refs->num_entries == 0) { + spin_unlock(&delayed_refs->lock); printk(KERN_INFO "delayed_refs has NO entry\n"); return ret; } @@ -2943,7 +3004,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root, break; /* opt_discard */ - ret = btrfs_error_discard_extent(root, start, end + 1 - start); + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_error_discard_extent(root, start, + end + 1 - start, + NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); @@ -3012,7 +3076,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents); - t->use_count = 0; + atomic_set(&t->use_count, 0); list_del_init(&t->list); memset(t, 0, sizeof(*t)); kmem_cache_free(btrfs_transaction_cachep, t); |