summaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c438
1 files changed, 251 insertions, 187 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdce8799b98d..98b6a71decba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,8 @@
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <asm/unaligned.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -40,6 +42,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "inode-map.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -136,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
* that covers the entire device
*/
static struct extent_map *btree_get_extent(struct inode *inode,
- struct page *page, size_t page_offset, u64 start, u64 len,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -153,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
}
read_unlock(&em_tree->lock);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
em = ERR_PTR(-ENOMEM);
goto out;
@@ -198,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
void btrfs_csum_final(u32 crc, char *result)
{
- *(__le32 *)result = ~cpu_to_le32(crc);
+ put_unaligned_le32(~crc, result);
}
/*
@@ -253,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs: %s checksum verify "
+ printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
"failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id,
(unsigned long long)buf->start, val, found,
btrfs_header_level(buf));
- }
if (result != (char *)&inline_result)
kfree(result);
return 1;
@@ -295,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- if (printk_ratelimit()) {
- printk("parent transid verify failed on %llu wanted %llu "
+ printk_ratelimited("parent transid verify failed on %llu wanted %llu "
"found %llu\n",
(unsigned long long)eb->start,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
- }
ret = 1;
clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
@@ -323,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
int num_copies = 0;
int mirror_num = 0;
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
!verify_parent_transid(io_tree, eb, parent_transid))
return ret;
+ /*
+ * This buffer's crc is fine, but its contents are corrupted, so
+ * there is no reason to read the other copies, they won't be
+ * any less wrong.
+ */
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ return ret;
+
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
@@ -359,14 +367,18 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
+ if (page->private == EXTENT_PAGE_PRIVATE) {
+ WARN_ON(1);
goto out;
- if (!page->private)
+ }
+ if (!page->private) {
+ WARN_ON(1);
goto out;
+ }
len = page->private >> 2;
WARN_ON(len == 0);
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ eb = alloc_extent_buffer(tree, start, len, page);
if (eb == NULL) {
WARN_ON(1);
goto out;
@@ -415,6 +427,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
return ret;
}
+#define CORRUPT(reason, eb, root, slot) \
+ printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d\n", reason, \
+ (unsigned long long)btrfs_header_bytenr(eb), \
+ (unsigned long long)root->objectid, slot)
+
+static noinline int check_leaf(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_key leaf_key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (nritems == 0)
+ return 0;
+
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("invalid item offset size pair", leaf, root, 0);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure each items keys are in the correct order and their
+ * offsets make sense. We only have to loop through nritems-1 because
+ * we check the current slot against the next slot, which verifies the
+ * next slot's offset+size makes sense and that the current's slot
+ * offset is correct.
+ */
+ for (slot = 0; slot < nritems - 1; slot++) {
+ btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just incase all the items are consistent to eachother, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
{
@@ -444,7 +523,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
len = page->private >> 2;
WARN_ON(len == 0);
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ eb = alloc_extent_buffer(tree, start, len, page);
if (eb == NULL) {
ret = -EIO;
goto out;
@@ -452,12 +531,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad tree block start "
+ printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
(unsigned long long)eb->start);
- }
ret = -EIO;
goto err;
}
@@ -469,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start);
- }
ret = -EIO;
goto err;
}
@@ -481,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
btrfs_set_buffer_lockdep_class(eb, found_level);
ret = csum_tree_block(root, eb, 1);
- if (ret)
+ if (ret) {
ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && check_leaf(root, eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ ret = -EIO;
+ }
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
@@ -557,12 +644,6 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
return 256 * limit;
}
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
-{
- return atomic_read(&info->nr_async_bios) >
- btrfs_async_submit_limit(info);
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -843,7 +924,6 @@ static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
- .sync_page = block_sync_page,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
@@ -871,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_buffer *eb;
eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, GFP_NOFS);
+ bytenr, blocksize);
return eb;
}
@@ -882,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL, GFP_NOFS);
+ bytenr, blocksize, NULL);
return eb;
}
@@ -966,13 +1046,13 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
- spin_lock_init(&root->node_lock);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->accounting_lock);
@@ -988,7 +1068,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1156,7 +1236,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root, fs_info, location->objectid);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret == 0) {
l = path->nodes[0];
@@ -1180,27 +1263,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
return root;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid)
-{
- struct btrfs_root *root;
-
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
- if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
-
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_objectid);
- return root;
-}
-
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
@@ -1229,6 +1299,19 @@ again:
if (IS_ERR(root))
return root;
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ if (!root->free_ino_ctl)
+ goto fail;
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned)
+ goto fail;
+
+ btrfs_init_free_ino_ctl(root);
+ mutex_init(&root->fs_commit_mutex);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
set_anon_super(&root->anon_super, NULL);
if (btrfs_root_refs(&root->root_item) == 0) {
@@ -1272,41 +1355,6 @@ fail:
return ERR_PTR(ret);
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- const char *name, int namelen)
-{
- return btrfs_read_fs_root_no_name(fs_info, location);
-#if 0
- struct btrfs_root *root;
- int ret;
-
- root = btrfs_read_fs_root_no_name(fs_info, location);
- if (!root)
- return NULL;
-
- if (root->in_sysfs)
- return root;
-
- ret = btrfs_set_root_name(root, name, namelen);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
- }
-
- ret = btrfs_sysfs_add_root(root);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root->name);
- kfree(root);
- return ERR_PTR(ret);
- }
- root->in_sysfs = 1;
- return root;
-#endif
-}
-
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1314,7 +1362,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
struct btrfs_device *device;
struct backing_dev_info *bdi;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
@@ -1323,86 +1372,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
break;
}
}
+ rcu_read_unlock();
return ret;
}
/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct btrfs_device *device;
- struct btrfs_fs_info *info;
-
- info = (struct btrfs_fs_info *)bdi->unplug_io_data;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
-
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, page);
- }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct inode *inode;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct address_space *mapping;
- u64 offset;
-
- /* the generic O_DIRECT read code does this */
- if (1 || !page) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- /*
- * page->mapping may change at any time. Get a consistent copy
- * and use that for everything below
- */
- smp_mb();
- mapping = page->mapping;
- if (!mapping)
- return;
-
- inode = mapping->host;
-
- /*
- * don't do the expensive searching for a small number of
- * devices
- */
- if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- offset = page_offset(page);
-
- em_tree = &BTRFS_I(inode)->extent_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- read_unlock(&em_tree->lock);
- if (!em) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- __unplug_io_fn(bdi, page);
- return;
- }
- offset = offset - em->start;
- btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
- em->block_start + offset, page);
- free_extent_map(em);
-}
-
-/*
* If this fails, caller must call bdi_destroy() to get rid of the
* bdi again.
*/
@@ -1416,8 +1390,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return err;
bdi->ra_pages = default_backing_dev_info.ra_pages;
- bdi->unplug_io_fn = btrfs_unplug_io_fn;
- bdi->unplug_io_data = info;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
@@ -1503,6 +1475,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ btrfs_run_defrag_inodes(root->fs_info);
}
if (freezing(current)) {
@@ -1592,7 +1565,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = NULL;
struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1604,11 +1577,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_super_block *disk_super;
- if (!extent_root || !tree_root || !fs_info ||
+ if (!extent_root || !tree_root || !tree_root->fs_info ||
!chunk_root || !dev_root || !csum_root) {
err = -ENOMEM;
goto fail;
}
+ fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@@ -1628,6 +1602,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_bdi;
}
+ fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1641,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1663,15 +1640,35 @@ struct btrfs_root *open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->defrag_running, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
+ fs_info->defrag_inodes = RB_ROOT;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_extents);
spin_lock_init(&fs_info->ordered_extent_lock);
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_NOFS);
+ if (!fs_info->delayed_root) {
+ err = -ENOMEM;
+ goto fail_iput;
+ }
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ init_rwsem(&fs_info->scrub_super_lock);
+ fs_info->scrub_workers_refcnt = 0;
+ btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+ fs_info->thread_pool_size, &fs_info->generic_worker);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -1690,10 +1687,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping,
- GFP_NOFS);
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
- GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -1707,9 +1702,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->block_group_cache_tree = RB_ROOT;
extent_io_tree_init(&fs_info->freed_extents[0],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
extent_io_tree_init(&fs_info->freed_extents[1],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
@@ -1739,7 +1734,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
@@ -1751,17 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
disk_super = &fs_info->super_copy;
if (!btrfs_super_root(disk_super))
- goto fail_iput;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
- goto fail_iput;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -1771,7 +1772,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
"unsupported optional features (%Lx).\n",
(unsigned long long)features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
@@ -1787,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
"unsupported option features (%Lx).\n",
(unsigned long long)features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
btrfs_init_workers(&fs_info->generic_worker,
@@ -1834,6 +1835,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1, &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1855,6 +1859,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+ btrfs_start_workers(&fs_info->delayed_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1963,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_alloc_profile = (u64)-1;
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+ goto fail_block_groups;
+ }
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2054,9 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY)) {
down_read(&fs_info->cleanup_work_sem);
- btrfs_orphan_cleanup(fs_info->fs_root);
- btrfs_orphan_cleanup(fs_info->tree_root);
+ err = btrfs_orphan_cleanup(fs_info->fs_root);
+ if (!err)
+ err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+ if (err) {
+ close_ctree(tree_root);
+ return ERR_PTR(err);
+ }
}
return tree_root;
@@ -2100,6 +2116,9 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->delayed_workers);
+fail_alloc:
+ kfree(fs_info->delayed_root);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2127,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (printk_ratelimit()) {
- printk(KERN_WARNING "lost page write due to "
+ printk_ratelimited(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
bdevname(bh->b_bdev, b));
- }
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
@@ -2295,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
continue;
@@ -2328,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
total_errors = 0;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev)
continue;
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2366,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
return 0;
}
static void free_fs_root(struct btrfs_root *root)
{
+ iput(root->cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
@@ -2379,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
}
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
kfree(root->name);
kfree(root);
}
@@ -2431,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
+ int err;
+
root_objectid = gang[i]->root_key.objectid;
- btrfs_orphan_cleanup(gang[i]);
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ return err;
}
root_objectid++;
}
@@ -2478,6 +2504,15 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ btrfs_scrub_cancel(root);
+
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+
+ /* clear out the rbtree of defraggable inodes */
+ btrfs_run_defrag_inodes(root->fs_info);
+
btrfs_put_block_group_cache(fs_info);
/*
@@ -2489,7 +2524,7 @@ int close_ctree(struct btrfs_root *root)
* ERROR state on disk.
*
* 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+ * and in such case, btrfs cannot write sb via btrfs_commit_super,
* and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
* btrfs will cleanup all FS resources first and write sb then.
*/
@@ -2536,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
+ kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2547,6 +2583,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2623,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
if (current->flags & PF_MEMALLOC)
return;
+ btrfs_balance_delayed_items(root);
+
+ num_dirty = root->fs_info->dirty_metadata_bytes;
+
+ if (num_dirty > thresh) {
+ balance_dirty_pages_ratelimited_nr(
+ root->fs_info->btree_inode->i_mapping, 1);
+ }
+ return;
+}
+
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ u64 num_dirty;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
@@ -2655,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
goto out;
len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+ eb = find_extent_buffer(io_tree, bytenr, len);
if (!eb)
goto out;
@@ -2782,6 +2842,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
+ spin_unlock(&delayed_refs->lock);
printk(KERN_INFO "delayed_refs has NO entry\n");
return ret;
}
@@ -2943,7 +3004,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
break;
/* opt_discard */
- ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_error_discard_extent(root, start,
+ end + 1 - start,
+ NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
@@ -3012,7 +3076,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
- t->use_count = 0;
+ atomic_set(&t->use_count, 0);
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);