summary refs log tree commit diff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- fs/btrfs/Makefile 2
-rw-r--r-- fs/btrfs/acl.c 16
-rw-r--r-- fs/btrfs/btrfs_inode.h 18
-rw-r--r-- fs/btrfs/compression.c 64
-rw-r--r-- fs/btrfs/compression.h 2
-rw-r--r-- fs/btrfs/ctree.c 210
-rw-r--r-- fs/btrfs/ctree.h 288
-rw-r--r-- fs/btrfs/delayed-inode.c 1695
-rw-r--r-- fs/btrfs/delayed-inode.h 141
-rw-r--r-- fs/btrfs/delayed-ref.c 120
-rw-r--r-- fs/btrfs/delayed-ref.h 6
-rw-r--r-- fs/btrfs/dir-item.c 76
-rw-r--r-- fs/btrfs/disk-io.c 438
-rw-r--r-- fs/btrfs/disk-io.h 19
-rw-r--r-- fs/btrfs/export.c 33
-rw-r--r-- fs/btrfs/extent-tree.c 2153
-rw-r--r-- fs/btrfs/extent_io.c 602
-rw-r--r-- fs/btrfs/extent_io.h 45
-rw-r--r-- fs/btrfs/extent_map.c 14
-rw-r--r-- fs/btrfs/extent_map.h 4
-rw-r--r-- fs/btrfs/file-item.c 43
-rw-r--r-- fs/btrfs/file.c 808
-rw-r--r-- fs/btrfs/free-space-cache.c 1591
-rw-r--r-- fs/btrfs/free-space-cache.h 50
-rw-r--r-- fs/btrfs/inode-item.c 2
-rw-r--r-- fs/btrfs/inode-map.c 447
-rw-r--r-- fs/btrfs/inode-map.h 13
-rw-r--r-- fs/btrfs/inode.c 1426
-rw-r--r-- fs/btrfs/ioctl.c 751
-rw-r--r-- fs/btrfs/ioctl.h 107
-rw-r--r-- fs/btrfs/locking.c 25
-rw-r--r-- fs/btrfs/locking.h 2
-rw-r--r-- fs/btrfs/lzo.c 21
-rw-r--r-- fs/btrfs/ordered-data.c 8
-rw-r--r-- fs/btrfs/ref-cache.c 164
-rw-r--r-- fs/btrfs/ref-cache.h 24
-rw-r--r-- fs/btrfs/relocation.c 93
-rw-r--r-- fs/btrfs/root-tree.c 85
-rw-r--r-- fs/btrfs/scrub.c 1369
-rw-r--r-- fs/btrfs/super.c 122
-rw-r--r-- fs/btrfs/sysfs.c 77
-rw-r--r-- fs/btrfs/transaction.c 260
-rw-r--r-- fs/btrfs/transaction.h 9
-rw-r--r-- fs/btrfs/tree-defrag.c 2
-rw-r--r-- fs/btrfs/tree-log.c 272
-rw-r--r-- fs/btrfs/tree-log.h 1
-rw-r--r-- fs/btrfs/version.sh 43
-rw-r--r-- fs/btrfs/volumes.c 915
-rw-r--r-- fs/btrfs/volumes.h 37
-rw-r--r-- fs/btrfs/xattr.c 53
-rw-r--r-- fs/btrfs/xattr.h 3
-rw-r--r-- fs/btrfs/zlib.c 3
52 files changed, 8971 insertions, 5801 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 31610ea73aec..9b72dcf1cd25 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,4 +7,4 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
- compression.o delayed-ref.o relocation.o
+ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9c949348510b..f66fc9959733 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -170,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
int ret;
struct posix_acl *acl = NULL;
- if (!is_owner_or_cap(dentry->d_inode))
+ if (!inode_owner_or_capable(dentry->d_inode))
return -EPERM;
if (!IS_POSIXACL(dentry->d_inode))
@@ -178,16 +178,18 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
if (value) {
acl = posix_acl_from_xattr(value, size);
- if (acl == NULL) {
- value = NULL;
- size = 0;
- } else if (IS_ERR(acl)) {
+ if (IS_ERR(acl))
return PTR_ERR(acl);
+
+ if (acl) {
+ ret = posix_acl_valid(acl);
+ if (ret)
+ goto out;
}
}
ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
posix_acl_release(acl);
return ret;
@@ -286,7 +288,7 @@ int btrfs_acl_chmod(struct inode *inode)
return 0;
acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
+ if (IS_ERR_OR_NULL(acl))
return PTR_ERR(acl);
clone = posix_acl_clone(acl, GFP_KERNEL);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ccc991c542df..93b1aa932014 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -22,6 +22,7 @@
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
+#include "delayed-inode.h"
/* in memory btrfs inode */
struct btrfs_inode {
@@ -136,9 +137,8 @@ struct btrfs_inode {
* items we think we'll end up using, and reserved_extents is the number
* of extent items we've reserved metadata for.
*/
- spinlock_t accounting_lock;
atomic_t outstanding_extents;
- int reserved_extents;
+ atomic_t reserved_extents;
/*
* ordered_data_close is set by truncate when a file that used
@@ -153,20 +153,34 @@ struct btrfs_inode {
unsigned ordered_data_close:1;
unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
+ unsigned in_defrag:1;
/*
* always compress this one file
*/
unsigned force_compress:4;
+ struct btrfs_delayed_node *delayed_node;
+
struct inode vfs_inode;
};
+extern unsigned char btrfs_filetype_table[];
+
static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
}
+static inline u64 btrfs_ino(struct inode *inode)
+{
+ u64 ino = BTRFS_I(inode)->location.objectid;
+
+ if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+ ino = inode->i_ino;
+ return ino;
+}
+
static inline void btrfs_i_size_write(struct inode *inode, u64 size)
{
i_size_write(inode, size);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 4d2110eafe29..bfe42b03eaf9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -125,9 +125,10 @@ static int check_compressed_csum(struct inode *inode,
kunmap_atomic(kaddr, KM_USER0);
if (csum != *cb_sum) {
- printk(KERN_INFO "btrfs csum failed ino %lu "
+ printk(KERN_INFO "btrfs csum failed ino %llu "
"extent %llu csum %u "
- "wanted %u mirror %d\n", inode->i_ino,
+ "wanted %u mirror %d\n",
+ (unsigned long long)btrfs_ino(inode),
(unsigned long long)disk_start,
csum, *cb_sum, cb->mirror_num);
ret = -EIO;
@@ -332,7 +333,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
struct compressed_bio *cb;
unsigned long bytes_left;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- int page_index = 0;
+ int pg_index = 0;
struct page *page;
u64 first_byte = disk_start;
struct block_device *bdev;
@@ -340,6 +341,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+ if (!cb)
+ return -ENOMEM;
atomic_set(&cb->pending_bios, 0);
cb->errors = 0;
cb->inode = inode;
@@ -354,14 +357,18 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+ if(!bio) {
+ kfree(cb);
+ return -ENOMEM;
+ }
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
atomic_inc(&cb->pending_bios);
/* create and submit bios for the compressed pages */
bytes_left = compressed_len;
- for (page_index = 0; page_index < cb->nr_pages; page_index++) {
- page = compressed_pages[page_index];
+ for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+ page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
if (bio->bi_size)
ret = io_tree->ops->merge_bio_hook(page, 0,
@@ -426,7 +433,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
struct compressed_bio *cb)
{
unsigned long end_index;
- unsigned long page_index;
+ unsigned long pg_index;
u64 last_offset;
u64 isize = i_size_read(inode);
int ret;
@@ -450,13 +457,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
while (last_offset < compressed_end) {
- page_index = last_offset >> PAGE_CACHE_SHIFT;
+ pg_index = last_offset >> PAGE_CACHE_SHIFT;
- if (page_index > end_index)
+ if (pg_index > end_index)
break;
rcu_read_lock();
- page = radix_tree_lookup(&mapping->page_tree, page_index);
+ page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
if (page) {
misses++;
@@ -470,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, page_index,
+ if (add_to_page_cache_lru(page, mapping, pg_index,
GFP_NOFS)) {
page_cache_release(page);
goto next;
@@ -554,7 +561,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
unsigned long compressed_len;
unsigned long nr_pages;
- unsigned long page_index;
+ unsigned long pg_index;
struct page *page;
struct block_device *bdev;
struct bio *comp_bio;
@@ -607,10 +614,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- for (page_index = 0; page_index < nr_pages; page_index++) {
- cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
__GFP_HIGHMEM);
- if (!cb->compressed_pages[page_index])
+ if (!cb->compressed_pages[pg_index])
goto fail2;
}
cb->nr_pages = nr_pages;
@@ -628,8 +635,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio->bi_end_io = end_compressed_bio_read;
atomic_inc(&cb->pending_bios);
- for (page_index = 0; page_index < nr_pages; page_index++) {
- page = cb->compressed_pages[page_index];
+ for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+ page = cb->compressed_pages[pg_index];
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_CACHE_SHIFT;
@@ -657,8 +664,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
atomic_inc(&cb->pending_bios);
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- btrfs_lookup_bio_sums(root, inode, comp_bio,
- sums);
+ ret = btrfs_lookup_bio_sums(root, inode,
+ comp_bio, sums);
+ BUG_ON(ret);
}
sums += (comp_bio->bi_size + root->sectorsize - 1) /
root->sectorsize;
@@ -683,8 +691,10 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
BUG_ON(ret);
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
- btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+ BUG_ON(ret);
+ }
ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
BUG_ON(ret);
@@ -693,8 +703,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
return 0;
fail2:
- for (page_index = 0; page_index < nr_pages; page_index++)
- free_page((unsigned long)cb->compressed_pages[page_index]);
+ for (pg_index = 0; pg_index < nr_pages; pg_index++)
+ free_page((unsigned long)cb->compressed_pages[pg_index]);
kfree(cb->compressed_pages);
fail1:
@@ -936,7 +946,7 @@ void btrfs_exit_compress(void)
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
- unsigned long *page_index,
+ unsigned long *pg_index,
unsigned long *pg_offset)
{
unsigned long buf_offset;
@@ -945,7 +955,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long working_bytes = total_out - buf_start;
unsigned long bytes;
char *kaddr;
- struct page *page_out = bvec[*page_index].bv_page;
+ struct page *page_out = bvec[*pg_index].bv_page;
/*
* start byte is the first byte of the page we're currently
@@ -986,11 +996,11 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
/* check if we need to pick another page */
if (*pg_offset == PAGE_CACHE_SIZE) {
- (*page_index)++;
- if (*page_index >= vcnt)
+ (*pg_index)++;
+ if (*pg_index >= vcnt)
return 0;
- page_out = bvec[*page_index].bv_page;
+ page_out = bvec[*pg_index].bv_page;
*pg_offset = 0;
start_byte = page_offset(page_out) - disk_start;
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 51000174b9d7..a12059f4f0fd 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -37,7 +37,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
unsigned long total_out, u64 disk_start,
struct bio_vec *bvec, int vcnt,
- unsigned long *page_index,
+ unsigned long *pg_index,
unsigned long *pg_offset);
int btrfs_submit_compressed_write(struct inode *inode, u64 start,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0dccfe..b0e18d986e0a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,11 +38,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *src_buf);
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
-static int setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
-
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -107,7 +102,7 @@ void btrfs_free_path(struct btrfs_path *p)
{
if (!p)
return;
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
kmem_cache_free(btrfs_path_cachep, p);
}
@@ -117,7 +112,7 @@ void btrfs_free_path(struct btrfs_path *p)
*
* It is safe to call this on paths that no locks or extent buffers held.
*/
-noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_path *p)
{
int i;
@@ -147,10 +142,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
- spin_lock(&root->node_lock);
- eb = root->node;
+
+ rcu_read_lock();
+ eb = rcu_dereference(root->node);
extent_buffer_get(eb);
- spin_unlock(&root->node_lock);
+ rcu_read_unlock();
return eb;
}
@@ -165,14 +161,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
while (1) {
eb = btrfs_root_node(root);
btrfs_tree_lock(eb);
-
- spin_lock(&root->node_lock);
- if (eb == root->node) {
- spin_unlock(&root->node_lock);
+ if (eb == root->node)
break;
- }
- spin_unlock(&root->node_lock);
-
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
@@ -458,10 +448,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
parent_start = 0;
- spin_lock(&root->node_lock);
- root->node = cow;
extent_buffer_get(cow);
- spin_unlock(&root->node_lock);
+ rcu_assign_pointer(root->node, cow);
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
@@ -542,6 +530,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = __btrfs_cow_block(trans, root, buf, parent,
parent_slot, cow_ret, search_start, 0);
+
+ trace_btrfs_cow_block(root, buf, *cow_ret);
+
return ret;
}
@@ -686,6 +677,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
if (!cur) {
cur = read_tree_block(root, blocknr,
blocksize, gen);
+ if (!cur)
+ return -EIO;
} else if (!uptodate) {
btrfs_read_buffer(cur, gen);
}
@@ -732,122 +725,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
return btrfs_item_offset_nr(leaf, nr - 1);
}
-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
- int level)
-{
- struct extent_buffer *parent = NULL;
- struct extent_buffer *node = path->nodes[level];
- struct btrfs_disk_key parent_key;
- struct btrfs_disk_key node_key;
- int parent_slot;
- int slot;
- struct btrfs_key cpukey;
- u32 nritems = btrfs_header_nritems(node);
-
- if (path->nodes[level + 1])
- parent = path->nodes[level + 1];
-
- slot = path->slots[level];
- BUG_ON(nritems == 0);
- if (parent) {
- parent_slot = path->slots[level + 1];
- btrfs_node_key(parent, &parent_key, parent_slot);
- btrfs_node_key(node, &node_key, 0);
- BUG_ON(memcmp(&parent_key, &node_key,
- sizeof(struct btrfs_disk_key)));
- BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
- btrfs_header_bytenr(node));
- }
- BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
- if (slot != 0) {
- btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
- btrfs_node_key(node, &node_key, slot);
- BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
- }
- if (slot < nritems - 1) {
- btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
- btrfs_node_key(node, &node_key, slot);
- BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
- }
- return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
- int level)
-{
- struct extent_buffer *leaf = path->nodes[level];
- struct extent_buffer *parent = NULL;
- int parent_slot;
- struct btrfs_key cpukey;
- struct btrfs_disk_key parent_key;
- struct btrfs_disk_key leaf_key;
- int slot = path->slots[0];
-
- u32 nritems = btrfs_header_nritems(leaf);
-
- if (path->nodes[level + 1])
- parent = path->nodes[level + 1];
-
- if (nritems == 0)
- return 0;
-
- if (parent) {
- parent_slot = path->slots[level + 1];
- btrfs_node_key(parent, &parent_key, parent_slot);
- btrfs_item_key(leaf, &leaf_key, 0);
-
- BUG_ON(memcmp(&parent_key, &leaf_key,
- sizeof(struct btrfs_disk_key)));
- BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
- btrfs_header_bytenr(leaf));
- }
- if (slot != 0 && slot < nritems - 1) {
- btrfs_item_key(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
- if (comp_keys(&leaf_key, &cpukey) <= 0) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad key\n", slot);
- BUG_ON(1);
- }
- if (btrfs_item_offset_nr(leaf, slot - 1) !=
- btrfs_item_end_nr(leaf, slot)) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad\n", slot);
- BUG_ON(1);
- }
- }
- if (slot < nritems - 1) {
- btrfs_item_key(leaf, &leaf_key, slot);
- btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
- BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
- if (btrfs_item_offset_nr(leaf, slot) !=
- btrfs_item_end_nr(leaf, slot + 1)) {
- btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d offset bad\n", slot);
- BUG_ON(1);
- }
- }
- BUG_ON(btrfs_item_offset_nr(leaf, 0) +
- btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
- return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
- struct btrfs_path *path, int level)
-{
- return 0;
- if (level == 0)
- return check_leaf(root, path, level);
- return check_node(root, path, level);
-}
/*
* search for key in the extent_buffer. The items start at offset p,
@@ -1046,9 +923,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- spin_lock(&root->node_lock);
- root->node = child;
- spin_unlock(&root->node_lock);
+ rcu_assign_pointer(root->node, child);
add_root_to_dirty_list(root);
btrfs_tree_unlock(child);
@@ -1188,7 +1063,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
}
}
/* double check we haven't messed things up */
- check_block(root, path, level);
if (orig_ptr !=
btrfs_node_blockptr(path->nodes[level], path->slots[level]))
BUG();
@@ -1449,7 +1323,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
ret = -EAGAIN;
/* release the whole path */
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* read the blocks */
if (block1)
@@ -1596,7 +1470,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
return 0;
}
free_extent_buffer(tmp);
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
return -EIO;
}
}
@@ -1615,7 +1489,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
if (p->reada)
reada_for_search(root, p, level, slot, key->objectid);
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
ret = -EAGAIN;
tmp = read_tree_block(root, blocknr, blocksize, 0);
@@ -1684,7 +1558,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
}
b = p->nodes[level];
if (!b) {
- btrfs_release_path(NULL, p);
+ btrfs_release_path(p);
goto again;
}
BUG_ON(btrfs_header_nritems(b) == 1);
@@ -1798,12 +1672,6 @@ cow_done:
if (!cow)
btrfs_unlock_up_safe(p, level + 1);
- ret = check_block(root, p, level);
- if (ret) {
- ret = -1;
- goto done;
- }
-
ret = bin_search(b, key, level, &slot);
if (level != 0) {
@@ -1880,7 +1748,7 @@ done:
if (!p->leave_spinning)
btrfs_set_path_blocking(p);
if (ret < 0)
- btrfs_release_path(root, p);
+ btrfs_release_path(p);
return ret;
}
@@ -2130,10 +1998,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
- spin_lock(&root->node_lock);
old = root->node;
- root->node = c;
- spin_unlock(&root->node_lock);
+ rcu_assign_pointer(root->node, c);
/* the super has an extra ref to root->node */
free_extent_buffer(old);
@@ -3155,7 +3021,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item);
extent_len = btrfs_file_extent_num_bytes(leaf, fi);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
path->keep_locks = 1;
path->search_for_split = 1;
@@ -3345,7 +3211,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
u32 new_size, int from_end)
{
- int ret = 0;
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
@@ -3443,12 +3308,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
- ret = 0;
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
+ return 0;
}
/*
@@ -3458,7 +3322,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u32 data_size)
{
- int ret = 0;
int slot;
struct extent_buffer *leaf;
struct btrfs_item *item;
@@ -3523,12 +3386,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
- ret = 0;
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
- return ret;
+ return 0;
}
/*
@@ -3688,11 +3550,10 @@ out:
* to save stack depth by doing the bulk of the work in a function
* that doesn't call btrfs_search_slot
*/
-static noinline_for_stack int
-setup_items_for_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
struct btrfs_item *item;
int i;
@@ -3776,7 +3637,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,
ret = 0;
if (slot == 0) {
- struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
ret = fixup_low_keys(trans, root, path, &disk_key, 1);
}
@@ -3840,7 +3700,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
unsigned long ptr;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (!ret) {
leaf = path->nodes[0];
@@ -4077,7 +3938,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
else
return 1;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ret;
@@ -4201,7 +4062,7 @@ find_next_key:
sret = btrfs_find_next_key(root, path, min_key, level,
cache_only, min_trans);
if (sret == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
} else {
goto out;
@@ -4217,6 +4078,7 @@ find_next_key:
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
+ BUG_ON(!cur);
btrfs_tree_lock(cur);
@@ -4279,7 +4141,7 @@ next:
btrfs_node_key_to_cpu(c, &cur_key, slot);
orig_lowest = path->lowest_level;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, root, &cur_key, path,
0, 0);
@@ -4356,7 +4218,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
again:
level = 1;
next = NULL;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
path->keep_locks = 1;
@@ -4412,7 +4274,7 @@ again:
goto again;
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto done;
}
@@ -4451,7 +4313,7 @@ again:
goto again;
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto done;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..6c093fa98f61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,15 +23,18 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/fs.h>
+#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/kobject.h>
+#include <trace/events/btrfs.h>
#include <asm/kmap_types.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
+#include "ioctl.h"
struct btrfs_trans_handle;
struct btrfs_transaction;
@@ -40,6 +43,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
+extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
#define BTRFS_MAGIC "_BHRfS_M"
@@ -103,6 +107,12 @@ struct btrfs_ordered_sum;
/* For storing free space cache */
#define BTRFS_FREE_SPACE_OBJECTID -11ULL
+/*
+ * The inode number assigned to the special inode for storing
+ * free ino cache
+ */
+#define BTRFS_FREE_INO_OBJECTID -12ULL
+
/* dummy objectid represents multiple objectids */
#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
@@ -185,7 +195,6 @@ struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
};
-#define BTRFS_UUID_SIZE 16
struct btrfs_dev_item {
/* the internal btrfs device id */
__le64 devid;
@@ -292,7 +301,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
sizeof(struct btrfs_stripe) * (num_stripes - 1);
}
-#define BTRFS_FSID_SIZE 16
#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
@@ -508,6 +516,12 @@ struct btrfs_extent_item_v0 {
/* use full backrefs for extent pointers in the block */
#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
+/*
+ * this flag is only used internally by scrub and may be changed at any time
+ * it is only declared here to avoid collisions
+ */
+#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
+
struct btrfs_tree_block_info {
struct btrfs_disk_key key;
u8 level;
@@ -716,7 +730,7 @@ struct btrfs_space_info {
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
u64 bytes_used; /* total bytes used,
- this does't take mirrors into account */
+ this doesn't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
@@ -729,10 +743,21 @@ struct btrfs_space_info {
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
- int full; /* indicates that we cannot allocate any more
+ /*
+ * we bump reservation progress every time we decrement
+ * bytes_reserved. This way people waiting for reservations
+ * know something good has happened and they can check
+ * for progress. The number here isn't to be trusted, it
+ * just shows reclaim activity
+ */
+ unsigned long reservation_progress;
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
- int force_alloc; /* set if we need to force a chunk alloc for
- this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
struct list_head list;
@@ -773,9 +798,6 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
- /* if this cluster simply points at a bitmap in the block group */
- bool points_to_bitmap;
-
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -820,9 +842,6 @@ struct btrfs_block_group_cache {
u64 bytes_super;
u64 flags;
u64 sectorsize;
- int extents_thresh;
- int free_extents;
- int total_bitmaps;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
@@ -837,9 +856,7 @@ struct btrfs_block_group_cache {
struct btrfs_space_info *space_info;
/* free space cache stuff */
- spinlock_t tree_lock;
- struct rb_root free_space_offset;
- u64 free_space;
+ struct btrfs_free_space_ctl *free_space_ctl;
/* block group cache stuff */
struct rb_node cache_node;
@@ -859,6 +876,7 @@ struct btrfs_block_group_cache {
struct reloc_control;
struct btrfs_device;
struct btrfs_fs_devices;
+struct btrfs_delayed_root;
struct btrfs_fs_info {
u8 fsid[BTRFS_FSID_SIZE];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -885,7 +903,10 @@ struct btrfs_fs_info {
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
- /* block reservation for extent, checksum and root tree */
+ /*
+ * block reservation for extent, checksum, root tree and
+ * delayed dir index item
+ */
struct btrfs_block_rsv global_block_rsv;
/* block reservation for delay allocation */
struct btrfs_block_rsv delalloc_block_rsv;
@@ -1012,6 +1033,7 @@ struct btrfs_fs_info {
* for the sys_munmap function call path
*/
struct btrfs_workers fixup_workers;
+ struct btrfs_workers delayed_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
@@ -1052,6 +1074,11 @@ struct btrfs_fs_info {
/* all metadata allocations go through this cluster */
struct btrfs_free_cluster meta_alloc_cluster;
+ /* auto defrag inodes go here */
+ spinlock_t defrag_inodes_lock;
+ struct rb_root defrag_inodes;
+ atomic_t defrag_running;
+
spinlock_t ref_cache_lock;
u64 total_ref_cache_size;
@@ -1067,8 +1094,21 @@ struct btrfs_fs_info {
void *bdev_holder;
+ /* private scrub information */
+ struct mutex scrub_lock;
+ atomic_t scrubs_running;
+ atomic_t scrub_pause_req;
+ atomic_t scrubs_paused;
+ atomic_t scrub_cancel_req;
+ wait_queue_head_t scrub_pause_wait;
+ struct rw_semaphore scrub_super_lock;
+ int scrub_workers_refcnt;
+ struct btrfs_workers scrub_workers;
+
/* filesystem state */
u64 fs_state;
+
+ struct btrfs_delayed_root *delayed_root;
};
/*
@@ -1078,9 +1118,6 @@ struct btrfs_fs_info {
struct btrfs_root {
struct extent_buffer *node;
- /* the node lock is held while changing the node pointer */
- spinlock_t node_lock;
-
struct extent_buffer *commit_root;
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
@@ -1097,6 +1134,16 @@ struct btrfs_root {
spinlock_t accounting_lock;
struct btrfs_block_rsv *block_rsv;
+ /* free ino cache stuff */
+ struct mutex fs_commit_mutex;
+ struct btrfs_free_space_ctl *free_ino_ctl;
+ enum btrfs_caching_type cached;
+ spinlock_t cache_lock;
+ wait_queue_head_t cache_wait;
+ struct btrfs_free_space_ctl *free_ino_pinned;
+ u64 cache_progress;
+ struct inode *cache_inode;
+
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1152,12 +1199,49 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
+ * radix tree that keeps track of delayed nodes of every inode,
+ * protected by inode_lock
+ */
+ struct radix_tree_root delayed_nodes_tree;
+ /*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
*/
struct super_block anon_super;
};
+struct btrfs_ioctl_defrag_range_args {
+ /* start of the defrag operation */
+ __u64 start;
+
+ /* number of bytes to defrag, use (u64)-1 to say all */
+ __u64 len;
+
+ /*
+ * flags for the operation, which can include turning
+ * on compression for this one defrag
+ */
+ __u64 flags;
+
+ /*
+ * any extent bigger than this will be considered
+ * already defragged. Use 0 to take the kernel default
+ * Use 1 to say every single extent must be rewritten
+ */
+ __u32 extent_thresh;
+
+ /*
+ * which compression method to use if turning on compression
+ * for this defrag operation. If unspecified, zlib will
+ * be used
+ */
+ __u32 compress_type;
+
+ /* spare for later */
+ __u32 unused[4];
+};
+
+
/*
* inode items have the data typically returned from stat and store other
* info about object characteristics. There is one for every file and dir in
@@ -1254,6 +1338,8 @@ struct btrfs_root {
#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
+#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1273,6 +1359,9 @@ struct btrfs_root {
#define BTRFS_INODE_NODUMP (1 << 8)
#define BTRFS_INODE_NOATIME (1 << 9)
#define BTRFS_INODE_DIRSYNC (1 << 10)
+#define BTRFS_INODE_COMPRESS (1 << 11)
+
+#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
/* some macros to generate set/get funcs for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple
@@ -1426,26 +1515,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
}
-static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr,
- u64 val)
-{
- btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
-}
-
static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
struct btrfs_chunk *c, int nr)
{
return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
}
-static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
- struct btrfs_chunk *c, int nr,
- u64 val)
-{
- btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
-}
-
/* struct btrfs_block_group_item */
BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
used, 64);
@@ -1503,14 +1578,6 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
return (struct btrfs_timespec *)ptr;
}
-static inline struct btrfs_timespec *
-btrfs_inode_otime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, otime);
- return (struct btrfs_timespec *)ptr;
-}
-
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
@@ -1861,33 +1928,6 @@ static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
return (u8 *)ptr;
}
-static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
-{
- unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
- return (u8 *)ptr;
-}
-
-static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
-{
- unsigned long ptr = offsetof(struct btrfs_header, csum);
- return (u8 *)ptr;
-}
-
-static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
-{
- return NULL;
-}
-
-static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
-{
- return NULL;
-}
-
-static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
-{
- return NULL;
-}
-
static inline int btrfs_is_leaf(struct extent_buffer *eb)
{
return btrfs_header_level(eb) == 0;
@@ -2041,22 +2081,6 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
-static inline int btrfs_set_root_name(struct btrfs_root *root,
- const char *name, int len)
-{
- /* if we already have a name just free it */
- kfree(root->name);
-
- root->name = kmalloc(len+1, GFP_KERNEL);
- if (!root->name)
- return -ENOMEM;
-
- memcpy(root->name, name, len);
- root->name[len] = '\0';
-
- return 0;
-}
-
static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
{
if (level == 0)
@@ -2085,6 +2109,13 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
}
/* extent-tree.c */
+/*
+ * Number of metadata bytes to reserve for modifying @num_items items in
+ * @root: one leaf plus one node for each of the remaining
+ * BTRFS_MAX_LEVEL - 1 levels, times three, per item.
+ *
+ * The arithmetic is widened to u64 before multiplying. The size fields are
+ * declared outside this hunk and are presumably 32-bit; in that case the
+ * product would otherwise be computed in 32 bits and could wrap before
+ * being converted to the u64 return type.
+ */
+static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
+						 int num_items)
+{
+	u64 per_item = (u64)root->leafsize +
+		       (u64)root->nodesize * (BTRFS_MAX_LEVEL - 1);
+
+	return per_item * 3 * num_items;
+}
+
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
@@ -2094,12 +2125,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr);
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
@@ -2147,6 +2175,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2217,8 +2247,12 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_root *root,
u64 start, u64 end);
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes);
+ u64 num_bytes, u64 *actual_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type);
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2270,10 +2304,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *parent,
int start_slot, int cache_only, u64 *last_ret,
struct btrfs_key *progress);
-void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
+void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_clear_path_blocking(struct btrfs_path *p,
+ struct extent_buffer *held);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2285,13 +2321,12 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, void *data, u32 data_size);
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2337,16 +2372,16 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
*item);
int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
- u64 *found_objectid);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
int btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
+void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+
/* dir-item.c */
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, const char *name,
- int name_len, u64 dir,
+ int name_len, struct inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2380,6 +2415,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2388,12 +2426,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
-/* inode-map.c */
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
- struct btrfs_root *fs_root,
- u64 dirid, u64 *objectid);
-int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
-
/* inode-item.c */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -2438,8 +2470,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
- u64 start, unsigned long len);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -2447,8 +2477,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
u64 isize);
-int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start,
- u64 end, struct list_head *list);
+int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ struct list_head *list, int search_commit);
/* inode.c */
/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2477,8 +2507,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
- int sync);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
@@ -2495,9 +2523,8 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
-void btrfs_put_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
+void btrfs_dirty_inode(struct inode *inode, int flags);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
@@ -2506,17 +2533,15 @@ void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
-int btrfs_commit_write(struct file *file, struct page *page,
- unsigned from, unsigned to);
struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
- size_t page_offset, u64 start, u64 end,
+ size_t pg_offset, u64 start, u64 end,
int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
-void btrfs_orphan_cleanup(struct btrfs_root *root);
+int btrfs_orphan_cleanup(struct btrfs_root *root);
void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
@@ -2524,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
@@ -2541,18 +2566,27 @@ extern const struct dentry_operations btrfs_dentry_operations;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void btrfs_update_iflags(struct inode *inode);
void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
-
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_pages);
/* file.c */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
-int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
u64 start, u64 end, u64 *hint_byte, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end);
int btrfs_release_file(struct inode *inode, struct file *file);
+void btrfs_drop_pages(struct page **pages, size_t num_pages);
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -2561,10 +2595,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
-int btrfs_sysfs_add_root(struct btrfs_root *root);
-void btrfs_sysfs_del_root(struct btrfs_root *root);
-void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -2607,4 +2637,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
u64 *bytes_to_reserve);
void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
+
+/* scrub.c */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+ struct btrfs_scrub_progress *progress, int readonly);
+int btrfs_scrub_pause(struct btrfs_root *root);
+int btrfs_scrub_pause_super(struct btrfs_root *root);
+int btrfs_scrub_continue(struct btrfs_root *root);
+int btrfs_scrub_continue_super(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_root *root);
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+ struct btrfs_scrub_progress *progress);
+
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000000..01e29503a54b
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1695 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "delayed-inode.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+#define BTRFS_DELAYED_WRITEBACK 400
+#define BTRFS_DELAYED_BACKGROUND 100
+
+static struct kmem_cache *delayed_node_cache;
+
+/*
+ * Create the slab cache that struct btrfs_delayed_node objects are
+ * allocated from.
+ *
+ * Returns 0 on success, -ENOMEM if the cache could not be created.
+ */
+int __init btrfs_delayed_inode_init(void)
+{
+	const unsigned long cache_flags = SLAB_RECLAIM_ACCOUNT |
+					  SLAB_MEM_SPREAD;
+
+	delayed_node_cache = kmem_cache_create("delayed_node",
+					sizeof(struct btrfs_delayed_node),
+					0, cache_flags, NULL);
+
+	return delayed_node_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the delayed node slab cache, provided it was created. */
+void btrfs_delayed_inode_exit(void)
+{
+	if (!delayed_node_cache)
+		return;
+
+	kmem_cache_destroy(delayed_node_cache);
+}
+
+/*
+ * Put a freshly allocated delayed node into its initial state: bound to
+ * @root and @inode_id, with a zero reference count, empty item trees, no
+ * reserved bytes and both list heads initialised as empty.
+ */
+static inline void btrfs_init_delayed_node(
+				struct btrfs_delayed_node *delayed_node,
+				struct btrfs_root *root, u64 inode_id)
+{
+	/* identity */
+	delayed_node->root = root;
+	delayed_node->inode_id = inode_id;
+
+	/* locking, reference count and list linkage */
+	mutex_init(&delayed_node->mutex);
+	atomic_set(&delayed_node->refs, 0);
+	INIT_LIST_HEAD(&delayed_node->n_list);
+	INIT_LIST_HEAD(&delayed_node->p_list);
+	delayed_node->in_list = 0;
+
+	/* pending item trees and their bookkeeping */
+	delayed_node->ins_root = RB_ROOT;
+	delayed_node->del_root = RB_ROOT;
+	delayed_node->count = 0;
+	delayed_node->index_cnt = 0;
+	delayed_node->inode_dirty = 0;
+	delayed_node->bytes_reserved = 0;
+}
+
+/*
+ * Returns 1 when @item1 and @item2 are both BTRFS_DIR_INDEX_KEY items of
+ * the same objectid and @item2's offset directly follows @item1's,
+ * 0 otherwise.
+ */
+static inline int btrfs_is_continuous_delayed_item(
+					struct btrfs_delayed_item *item1,
+					struct btrfs_delayed_item *item2)
+{
+	if (item1->key.type != BTRFS_DIR_INDEX_KEY)
+		return 0;
+	if (item1->key.objectid != item2->key.objectid)
+		return 0;
+	if (item1->key.type != item2->key.type)
+		return 0;
+
+	return item1->key.offset + 1 == item2->key.offset;
+}
+
+/* Return the delayed root reachable through @root's fs_info. */
+static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
+						struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	return fs_info->delayed_root;
+}
+
+/*
+ * Return the delayed node of @inode with one reference taken for the
+ * caller, allocating and registering a new node when none exists yet.
+ *
+ * The node is looked up first in the pointer cached in the btrfs inode,
+ * then in root->delayed_nodes_tree under root->inode_lock. A node that is
+ * found in the tree or newly created is also cached in the btrfs inode,
+ * which holds a second reference.
+ *
+ * Returns the node, ERR_PTR(-ENOMEM) if the node cannot be allocated, or
+ * ERR_PTR() of the radix_tree_preload() error.
+ */
+static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
+ struct btrfs_root *root = btrfs_inode->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+again:
+ /* fast path: the node is already cached in the inode */
+ node = ACCESS_ONCE(btrfs_inode->delayed_node);
+ if (node) {
+ atomic_inc(&node->refs); /* can be accessed */
+ return node;
+ }
+
+ spin_lock(&root->inode_lock);
+ node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ if (node) {
+ /* the inode's cached pointer got set meanwhile, start over */
+ if (btrfs_inode->delayed_node) {
+ spin_unlock(&root->inode_lock);
+ goto again;
+ }
+ btrfs_inode->delayed_node = node;
+ atomic_inc(&node->refs); /* can be accessed */
+ atomic_inc(&node->refs); /* cached in the inode */
+ spin_unlock(&root->inode_lock);
+ return node;
+ }
+ spin_unlock(&root->inode_lock);
+
+ /* not found anywhere: allocate outside of the spinlock */
+ node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+ if (!node)
+ return ERR_PTR(-ENOMEM);
+ btrfs_init_delayed_node(node, root, ino);
+
+ atomic_inc(&node->refs); /* cached in the btrfs inode */
+ atomic_inc(&node->refs); /* can be accessed */
+
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret) {
+ kmem_cache_free(delayed_node_cache, node);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&root->inode_lock);
+ ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
+ if (ret == -EEXIST) {
+ /* a node for this ino was inserted meanwhile, drop ours and retry */
+ kmem_cache_free(delayed_node_cache, node);
+ spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
+ goto again;
+ }
+ btrfs_inode->delayed_node = node;
+ spin_unlock(&root->inode_lock);
+ radix_tree_preload_end();
+
+ return node;
+}
+
+/*
+ * Put @node on the lists of the delayed root @root.
+ *
+ * Call it when holding delayed_node->mutex
+ *
+ * A node that is not queued yet is added to the tail of both the node list
+ * and the prepare list, and the lists take one reference on it. A node
+ * that is already queued is only touched on the prepare list: it is moved
+ * to the tail if it is linked there, otherwise it is added there when
+ * mod = 1.
+ */
+static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node,
+ int mod)
+{
+ spin_lock(&root->lock);
+ if (node->in_list) {
+ if (!list_empty(&node->p_list))
+ list_move_tail(&node->p_list, &root->prepare_list);
+ else if (mod)
+ list_add_tail(&node->p_list, &root->prepare_list);
+ } else {
+ list_add_tail(&node->n_list, &root->node_list);
+ list_add_tail(&node->p_list, &root->prepare_list);
+ atomic_inc(&node->refs); /* inserted into list */
+ root->nodes++;
+ node->in_list = 1;
+ }
+ spin_unlock(&root->lock);
+}
+
+/*
+ * Take @node off the node list (and off the prepare list if it is linked
+ * there) and drop the reference the lists held. Does nothing when the node
+ * is not queued.
+ *
+ * Call it when holding delayed_node->mutex
+ */
+static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
+ struct btrfs_delayed_node *node)
+{
+ spin_lock(&root->lock);
+ if (node->in_list) {
+ root->nodes--;
+ atomic_dec(&node->refs); /* not in the list */
+ list_del_init(&node->n_list);
+ if (!list_empty(&node->p_list))
+ list_del_init(&node->p_list);
+ node->in_list = 0;
+ }
+ spin_unlock(&root->lock);
+}
+
+/*
+ * Return the first node on @delayed_root's node list with an extra
+ * reference taken, or NULL when the list is empty. The list is examined
+ * under delayed_root->lock.
+ */
+struct btrfs_delayed_node *btrfs_first_delayed_node(
+			struct btrfs_delayed_root *delayed_root)
+{
+	struct btrfs_delayed_node *first = NULL;
+
+	spin_lock(&delayed_root->lock);
+	if (!list_empty(&delayed_root->node_list)) {
+		first = list_entry(delayed_root->node_list.next,
+				   struct btrfs_delayed_node, n_list);
+		atomic_inc(&first->refs);
+	}
+	spin_unlock(&delayed_root->lock);
+
+	return first;
+}
+
+/*
+ * Return the node that follows @node on the delayed root's node list, with
+ * an extra reference taken.
+ *
+ * If @node is no longer queued, the first node of the list is returned
+ * instead. Returns NULL when @node is the last entry or the list is empty.
+ */
+struct btrfs_delayed_node *btrfs_next_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct list_head *p;
+ struct btrfs_delayed_node *next = NULL;
+
+ delayed_root = node->root->fs_info->delayed_root;
+ spin_lock(&delayed_root->lock);
+ if (!node->in_list) { /* not in the list */
+ if (list_empty(&delayed_root->node_list))
+ goto out;
+ p = delayed_root->node_list.next;
+ } else if (list_is_last(&node->n_list, &delayed_root->node_list))
+ goto out;
+ else
+ p = node->n_list.next;
+
+ next = list_entry(p, struct btrfs_delayed_node, n_list);
+ atomic_inc(&next->refs);
+out:
+ spin_unlock(&delayed_root->lock);
+
+ return next;
+}
+
+/*
+ * Drop one reference on @delayed_node; a NULL node is ignored.
+ *
+ * Before the reference is dropped the node is queued on the delayed root
+ * if it still has items (@mod is passed on to btrfs_queue_delayed_node()),
+ * or dequeued if it has none. When the last reference goes away the node
+ * is removed from root->delayed_nodes_tree and freed.
+ */
+static void __btrfs_release_delayed_node(
+ struct btrfs_delayed_node *delayed_node,
+ int mod)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ if (!delayed_node)
+ return;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->count)
+ btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
+ else
+ btrfs_dequeue_delayed_node(delayed_root, delayed_node);
+ mutex_unlock(&delayed_node->mutex);
+
+ if (atomic_dec_and_test(&delayed_node->refs)) {
+ struct btrfs_root *root = delayed_node->root;
+ spin_lock(&root->inode_lock);
+ /* re-check the count now that inode_lock is held */
+ if (atomic_read(&delayed_node->refs) == 0) {
+ radix_tree_delete(&root->delayed_nodes_tree,
+ delayed_node->inode_id);
+ kmem_cache_free(delayed_node_cache, delayed_node);
+ }
+ spin_unlock(&root->inode_lock);
+ }
+}
+
+/* Drop a reference on @node, calling the common helper with mod = 0. */
+static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 0);
+}
+
+/*
+ * Unlink the first node from @delayed_root's prepare list and return it
+ * with an extra reference taken, or return NULL when that list is empty.
+ * The list is manipulated under delayed_root->lock.
+ */
+struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+			struct btrfs_delayed_root *delayed_root)
+{
+	struct btrfs_delayed_node *prepared = NULL;
+	struct list_head *entry;
+
+	spin_lock(&delayed_root->lock);
+	if (!list_empty(&delayed_root->prepare_list)) {
+		entry = delayed_root->prepare_list.next;
+		list_del_init(entry);
+		prepared = list_entry(entry, struct btrfs_delayed_node,
+				      p_list);
+		atomic_inc(&prepared->refs);
+	}
+	spin_unlock(&delayed_root->lock);
+
+	return prepared;
+}
+
+/* Drop a reference on @node, calling the common helper with mod = 1. */
+static inline void btrfs_release_prepared_delayed_node(
+ struct btrfs_delayed_node *node)
+{
+ __btrfs_release_delayed_node(node, 1);
+}
+
+/*
+ * Allocate a delayed item with @data_len bytes of space following the
+ * structure itself. The item starts with a reference count of 1 and is not
+ * attached to any delayed node.
+ *
+ * Returns the new item, or NULL if the allocation fails.
+ */
+struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+{
+	struct btrfs_delayed_item *new_item;
+
+	new_item = kmalloc(sizeof(*new_item) + data_len, GFP_NOFS);
+	if (!new_item)
+		return NULL;
+
+	new_item->data_len = data_len;
+	new_item->ins_or_del = 0;
+	new_item->bytes_reserved = 0;
+	new_item->block_rsv = NULL;
+	new_item->delayed_node = NULL;
+	atomic_set(&new_item->refs, 1);
+
+	return new_item;
+}
+
+/*
+ * __btrfs_lookup_delayed_item - look up the delayed item by key
+ * @root: the rb-tree of delayed items to search
+ * @key: the key to look up
+ * @prev: used to store the prev item if the right item isn't found
+ * (may be NULL if the caller does not need it)
+ * @next: used to store the next item if the right item isn't found
+ * (may be NULL if the caller does not need it)
+ *
+ * Returns the item whose key equals @key, or NULL if there is none.
+ *
+ * Note: if we don't find the right item, we will return the prev item and
+ * the next item through @prev and @next; each is set to NULL when no such
+ * neighbour exists. On an exact match @prev and @next are left untouched.
+ */
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
+ struct rb_root *root,
+ struct btrfs_key *key,
+ struct btrfs_delayed_item **prev,
+ struct btrfs_delayed_item **next)
+{
+ struct rb_node *node, *prev_node = NULL;
+ struct btrfs_delayed_item *delayed_item = NULL;
+ int ret = 0;
+
+ node = root->rb_node;
+
+ while (node) {
+ delayed_item = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ prev_node = node;
+ ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
+ if (ret < 0)
+ node = node->rb_right;
+ else if (ret > 0)
+ node = node->rb_left;
+ else
+ return delayed_item;
+ }
+
+ /*
+ * No exact match. prev_node/delayed_item is the last item visited and
+ * ret tells on which side of @key it lies.
+ */
+ if (prev) {
+ if (!prev_node)
+ *prev = NULL;
+ else if (ret < 0)
+ *prev = delayed_item;
+ else if ((node = rb_prev(prev_node)) != NULL) {
+ *prev = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *prev = NULL;
+ }
+
+ if (next) {
+ if (!prev_node)
+ *next = NULL;
+ else if (ret > 0)
+ *next = delayed_item;
+ else if ((node = rb_next(prev_node)) != NULL) {
+ *next = rb_entry(node, struct btrfs_delayed_item,
+ rb_node);
+ } else
+ *next = NULL;
+ }
+ return NULL;
+}
+
+/*
+ * Find the item with exactly @key in @delayed_node's insertion tree.
+ * Returns NULL when there is no such item.
+ */
+struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+					struct btrfs_delayed_node *delayed_node,
+					struct btrfs_key *key)
+{
+	return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+					   NULL, NULL);
+}
+
+/*
+ * Find the item with exactly @key in @delayed_node's deletion tree.
+ * Returns NULL when there is no such item.
+ */
+struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
+					struct btrfs_delayed_node *delayed_node,
+					struct btrfs_key *key)
+{
+	return __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
+					   NULL, NULL);
+}
+
+/*
+ * Search @delayed_node's insertion tree for @key. Returns the item with
+ * exactly that key if present, otherwise the next item reported by the
+ * lookup, which is NULL when there is none.
+ */
+struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
+					struct btrfs_delayed_node *delayed_node,
+					struct btrfs_key *key)
+{
+	struct btrfs_delayed_item *exact, *successor;
+
+	exact = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+					    NULL, &successor);
+
+	/* successor is only written by the lookup when it finds no match */
+	return exact ? exact : successor;
+}
+
+/*
+ * Search @delayed_node's deletion tree for @key. Returns the item with
+ * exactly that key if present, otherwise the next item reported by the
+ * lookup, which is NULL when there is none.
+ */
+struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
+					struct btrfs_delayed_node *delayed_node,
+					struct btrfs_key *key)
+{
+	struct btrfs_delayed_item *exact, *successor;
+
+	exact = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
+					    NULL, &successor);
+
+	/* successor is only written by the lookup when it finds no match */
+	return exact ? exact : successor;
+}
+
+/*
+ * Link @ins into one of @delayed_node's item trees, ordered by key.
+ *
+ * @action selects the tree: BTRFS_DELAYED_INSERTION_ITEM for the insertion
+ * tree, BTRFS_DELAYED_DELETION_ITEM for the deletion tree; any other value
+ * hits BUG().
+ *
+ * Returns 0 on success or -EEXIST if the tree already holds an item with
+ * the same key, in which case nothing is modified.
+ */
+static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
+ struct btrfs_delayed_item *ins,
+ int action)
+{
+ struct rb_node **p, *node;
+ struct rb_node *parent_node = NULL;
+ struct rb_root *root;
+ struct btrfs_delayed_item *item;
+ int cmp;
+
+ if (action == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_node->ins_root;
+ else if (action == BTRFS_DELAYED_DELETION_ITEM)
+ root = &delayed_node->del_root;
+ else
+ BUG();
+ p = &root->rb_node;
+ node = &ins->rb_node;
+
+ /* walk down to the empty slot where the new key belongs */
+ while (*p) {
+ parent_node = *p;
+ item = rb_entry(parent_node, struct btrfs_delayed_item,
+ rb_node);
+
+ cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
+ if (cmp < 0)
+ p = &(*p)->rb_right;
+ else if (cmp > 0)
+ p = &(*p)->rb_left;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ ins->delayed_node = delayed_node;
+ ins->ins_or_del = action;
+
+ /* keep index_cnt one past the largest dir index queued for insertion */
+ if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
+ action == BTRFS_DELAYED_INSERTION_ITEM &&
+ ins->key.offset >= delayed_node->index_cnt)
+ delayed_node->index_cnt = ins->key.offset + 1;
+
+ /* account the item on the node and on the fs-wide delayed root */
+ delayed_node->count++;
+ atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
+ return 0;
+}
+
+/*
+ * Add @item to @node's insertion tree. Returns the result of
+ * __btrfs_add_delayed_item().
+ */
+static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_INSERTION_ITEM);
+}
+
+/*
+ * Add @item to @node's deletion tree. Returns the result of
+ * __btrfs_add_delayed_item().
+ */
+static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
+ struct btrfs_delayed_item *item)
+{
+ return __btrfs_add_delayed_item(node, item,
+ BTRFS_DELAYED_DELETION_ITEM);
+}
+
+/*
+ * Unlink @delayed_item from the tree of its delayed node that matches its
+ * ins_or_del type and drop it from the per-node and fs-wide item counts.
+ *
+ * The item must be attached to a delayed node (delayed_item->delayed_node
+ * is dereferenced unconditionally). The item itself is not freed here.
+ */
+static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
+{
+ struct rb_root *root;
+ struct btrfs_delayed_root *delayed_root;
+
+ delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
+
+ BUG_ON(!delayed_root);
+ BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
+ delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
+
+ if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
+ root = &delayed_item->delayed_node->ins_root;
+ else
+ root = &delayed_item->delayed_node->del_root;
+
+ rb_erase(&delayed_item->rb_node, root);
+ delayed_item->delayed_node->count--;
+ atomic_dec(&delayed_root->items);
+ /* wake waiters once the backlog is below the background threshold */
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
+ waitqueue_active(&delayed_root->wait))
+ wake_up(&delayed_root->wait);
+}
+
+/*
+ * Remove @item from its delayed node and drop one reference on it, freeing
+ * the item when that was the last reference. A NULL @item is ignored.
+ */
+static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
+{
+	if (!item)
+		return;
+
+	__btrfs_remove_delayed_item(item);
+	if (atomic_dec_and_test(&item->refs))
+		kfree(item);
+}
+
+/*
+ * Return the first item of @delayed_node's insertion tree, or NULL when
+ * the tree is empty.
+ */
+struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+					struct btrfs_delayed_node *delayed_node)
+{
+	struct rb_node *first = rb_first(&delayed_node->ins_root);
+
+	if (!first)
+		return NULL;
+
+	return rb_entry(first, struct btrfs_delayed_item, rb_node);
+}
+
+/*
+ * Return the first item of @delayed_node's deletion tree, or NULL when
+ * the tree is empty.
+ */
+struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+					struct btrfs_delayed_node *delayed_node)
+{
+	struct rb_node *first = rb_first(&delayed_node->del_root);
+
+	if (!first)
+		return NULL;
+
+	return rb_entry(first, struct btrfs_delayed_item, rb_node);
+}
+
+/*
+ * Return the item that follows @item in the tree it is linked into, or
+ * NULL when @item is the last one.
+ */
+struct btrfs_delayed_item *__btrfs_next_delayed_item(
+					struct btrfs_delayed_item *item)
+{
+	struct rb_node *successor = rb_next(&item->rb_node);
+
+	if (!successor)
+		return NULL;
+
+	return rb_entry(successor, struct btrfs_delayed_item, rb_node);
+}
+
+/*
+ * Return the delayed node cached in @inode with an extra reference taken,
+ * or NULL when the inode has none. Never creates a node.
+ */
+static inline struct btrfs_delayed_node *btrfs_get_delayed_node(
+							struct inode *inode)
+{
+	struct btrfs_delayed_node *cached = BTRFS_I(inode)->delayed_node;
+
+	if (!cached)
+		return NULL;
+
+	atomic_inc(&cached->refs);
+	return cached;
+}
+
+/*
+ * Return the root whose objectid is @root_id: @root itself when it already
+ * matches, otherwise whatever btrfs_read_fs_root_no_name() returns for a
+ * BTRFS_ROOT_ITEM_KEY key with that objectid and offset (u64)-1.
+ */
+static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
+						    u64 root_id)
+{
+	struct btrfs_key root_key;
+
+	if (root->objectid != root_id) {
+		root_key.objectid = root_id;
+		root_key.type = BTRFS_ROOT_ITEM_KEY;
+		root_key.offset = (u64)-1;
+		return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
+	}
+
+	return root;
+}
+
+/*
+ * Move the metadata reservation for one item from the transaction handle's
+ * block reservation to the global block reservation and record it in
+ * @item.
+ *
+ * Does nothing and returns 0 when @trans has no bytes reserved. Otherwise
+ * returns the result of btrfs_block_rsv_migrate(); @item is only updated
+ * when that is 0.
+ */
+static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
+					       struct btrfs_root *root,
+					       struct btrfs_delayed_item *item)
+{
+	struct btrfs_block_rsv *global_rsv;
+	u64 needed;
+	int err;
+
+	if (!trans->bytes_reserved)
+		return 0;
+
+	global_rsv = &root->fs_info->global_block_rsv;
+	needed = btrfs_calc_trans_metadata_size(root, 1);
+
+	err = btrfs_block_rsv_migrate(trans->block_rsv, global_rsv, needed);
+	if (err)
+		return err;
+
+	item->bytes_reserved = needed;
+	item->block_rsv = global_rsv;
+	return 0;
+}
+
+/*
+ * Give the bytes recorded in @item back to the block reservation recorded
+ * in @item. Does nothing when the item has no bytes reserved. The item's
+ * bytes_reserved field is left as it is.
+ */
+static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
+						struct btrfs_delayed_item *item)
+{
+	if (item->bytes_reserved)
+		btrfs_block_rsv_release(root, item->block_rsv,
+					item->bytes_reserved);
+}
+
+/*
+ * Move the metadata reservation for one item from the transaction handle's
+ * block reservation to the global block reservation and record the amount
+ * in @node.
+ *
+ * Does nothing and returns 0 when @trans has no bytes reserved. Otherwise
+ * returns the result of btrfs_block_rsv_migrate(); @node is only updated
+ * when that is 0.
+ */
+static int btrfs_delayed_inode_reserve_metadata(
+					struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_delayed_node *node)
+{
+	struct btrfs_block_rsv *global_rsv;
+	u64 needed;
+	int err;
+
+	if (!trans->bytes_reserved)
+		return 0;
+
+	global_rsv = &root->fs_info->global_block_rsv;
+	needed = btrfs_calc_trans_metadata_size(root, 1);
+
+	err = btrfs_block_rsv_migrate(trans->block_rsv, global_rsv, needed);
+	if (err)
+		return err;
+
+	node->bytes_reserved = needed;
+	return 0;
+}
+
+/*
+ * Give the bytes recorded in @node back to the global block reservation
+ * and reset the node's record to zero. Does nothing when the node has no
+ * bytes reserved.
+ */
+static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
+						 struct btrfs_delayed_node *node)
+{
+	if (node->bytes_reserved) {
+		btrfs_block_rsv_release(root,
+					&root->fs_info->global_block_rsv,
+					node->bytes_reserved);
+		node->bytes_reserved = 0;
+	}
+}
+
+/*
+ * This helper will insert some continuous items into the same leaf according
+ * to the free space of the leaf.
+ *
+ * @path must already point at a leaf (path->nodes[0]); path->slots[0] is the
+ * slot where the first item goes.  Starting at @item, delayed items are
+ * collected for as long as they still fit into the leaf's free space and
+ * each one is continuous with its predecessor.  The collected items are
+ * inserted with a single setup_items_for_insert() call; afterwards their
+ * metadata reservation is released and the delayed items are dropped.
+ *
+ * Returns 0 on success (including when nothing fits), -ENOMEM when the
+ * temporary key/size arrays cannot be allocated, or the error returned by
+ * setup_items_for_insert().
+ */
+static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ int free_space;
+ int total_data_size = 0, total_size = 0;
+ struct extent_buffer *leaf;
+ char *data_ptr;
+ struct btrfs_key *keys;
+ u32 *data_size;
+ struct list_head head;
+ int slot;
+ /* must start at zero: it is incremented and tested below */
+ int nitems = 0;
+ int i;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+ free_space = btrfs_leaf_free_space(root, leaf);
+ INIT_LIST_HEAD(&head);
+
+ next = item;
+
+ /*
+ * count the number of the continuous items that we can insert in batch
+ */
+ while (total_size + next->data_len + sizeof(struct btrfs_item) <=
+ free_space) {
+ total_data_size += next->data_len;
+ total_size += next->data_len + sizeof(struct btrfs_item);
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+ }
+
+ if (!nitems) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * we need allocate some memory space, but it might cause the task
+ * to sleep, so we set all locked nodes in the path to blocking locks
+ * first.
+ */
+ btrfs_set_path_blocking(path);
+
+ keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+ if (!keys) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+ if (!data_size) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ /* get keys of all the delayed items */
+ i = 0;
+ list_for_each_entry(next, &head, tree_list) {
+ keys[i] = next->key;
+ data_size[i] = next->data_len;
+ i++;
+ }
+
+ /* reset all the locked nodes in the path to spinning locks. */
+ btrfs_clear_path_blocking(path, NULL);
+
+ /* insert the keys of the items */
+ ret = setup_items_for_insert(trans, root, path, keys, data_size,
+ total_data_size, total_size, nitems);
+ if (ret)
+ goto error;
+
+ /* insert the dir index items */
+ slot = path->slots[0];
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ data_ptr = btrfs_item_ptr(leaf, slot, char);
+ write_extent_buffer(leaf, &curr->data,
+ (unsigned long)data_ptr,
+ curr->data_len);
+ slot++;
+
+ btrfs_delayed_item_release_metadata(root, curr);
+
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+error:
+ /* data_size is NULL here when its own allocation failed; kfree(NULL) is fine */
+ kfree(data_size);
+ kfree(keys);
+out:
+ return ret;
+}
+
+/*
+ * This helper can just do simple insertion that needn't extend item for new
+ * data, such as directory name index insertion, inode insertion.
+ *
+ * Inserts an empty item for delayed_item->key, copies the delayed item's
+ * data into it, marks the leaf dirty and releases the item's metadata
+ * reservation.  -EEXIST from btrfs_insert_empty_item() is not treated as a
+ * failure: the data is still written at the slot the path points to.
+ *
+ * Returns 0 on success or the negative error from btrfs_insert_empty_item().
+ * The path is left pointing at the item; the caller releases it.
+ */
+static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *delayed_item)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_item *item;
+ char *ptr;
+ int ret;
+
+ ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
+ delayed_item->data_len);
+ if (ret < 0 && ret != -EEXIST)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ /* NOTE(review): 'item' is assigned but never read in this function */
+ item = btrfs_item_nr(leaf, path->slots[0]);
+ ptr = btrfs_item_ptr(leaf, path->slots[0], char);
+
+ write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
+ delayed_item->data_len);
+ btrfs_mark_buffer_dirty(leaf);
+
+ btrfs_delayed_item_release_metadata(root, delayed_item);
+ return 0;
+}
+
+/*
+ * we insert an item first, then if there are some continuous items, we try
+ * to insert those items into the same leaf.
+ *
+ * Loops until @node has no insertion items left.  node->mutex is taken at
+ * the top of every pass and dropped before looping again or returning.
+ * Returns 0 when the insertion tree was drained, or the negative error from
+ * btrfs_insert_delayed_item().
+ */
+static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_insertion_item(node);
+ if (!curr)
+ goto insert_end;
+
+ ret = btrfs_insert_delayed_item(trans, root, path, curr);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ goto insert_end;
+ }
+
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
+ /* insert the continuous items into the same leaf */
+ path->slots[0]++;
+ /* NOTE(review): the return value of the batch insert is ignored */
+ btrfs_batch_insert_items(trans, root, path, curr);
+ }
+ /* the first item was only written above; drop it now */
+ btrfs_release_delayed_item(prev);
+ btrfs_mark_buffer_dirty(path->nodes[0]);
+
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+insert_end:
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+/*
+ * Delete a run of leaf items that match consecutive delayed deletion items.
+ *
+ * @path must point at a leaf slot.  Starting at @item and at path->slots[0],
+ * delayed items are collected while the delayed key equals the key found in
+ * the leaf, the next delayed item is continuous with the current one, and
+ * the leaf still has items.  The collected run is removed with a single
+ * btrfs_del_items() call, after which the delayed items' reservations are
+ * released and the items are dropped.
+ *
+ * Returns 0 on success or when nothing matched, -ENOENT when the slot is
+ * past the last item of the leaf, or the error from btrfs_del_items().
+ */
+static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_item *item)
+{
+ struct btrfs_delayed_item *curr, *next;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ struct list_head head;
+ int nitems, i, last_item;
+ int ret = 0;
+
+ BUG_ON(!path->nodes[0]);
+
+ leaf = path->nodes[0];
+
+ i = path->slots[0];
+ last_item = btrfs_header_nritems(leaf) - 1;
+ if (i > last_item)
+ return -ENOENT; /* FIXME: Is errno suitable? */
+
+ next = item;
+ INIT_LIST_HEAD(&head);
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ nitems = 0;
+ /*
+ * count the number of the dir index items that we can delete in batch
+ */
+ while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
+ list_add_tail(&next->tree_list, &head);
+ nitems++;
+
+ curr = next;
+ next = __btrfs_next_delayed_item(curr);
+ if (!next)
+ break;
+
+ if (!btrfs_is_continuous_delayed_item(curr, next))
+ break;
+
+ /* advance to the next leaf slot and compare against its key */
+ i++;
+ if (i > last_item)
+ break;
+ btrfs_item_key_to_cpu(leaf, &key, i);
+ }
+
+ if (!nitems)
+ return 0;
+
+ ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(curr, next, &head, tree_list) {
+ btrfs_delayed_item_release_metadata(root, curr);
+ list_del(&curr->tree_list);
+ btrfs_release_delayed_item(curr);
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * Apply the pending deletion items of @node to the tree.
+ *
+ * Each pass takes node->mutex, looks up the first deletion item with
+ * btrfs_search_slot() and, when it is found, deletes it together with any
+ * following items via btrfs_batch_delete_items().  A deletion item whose key
+ * is not in the tree is simply dropped.  The mutex is released before every
+ * jump back to do_again, because do_again takes it again.
+ *
+ * Returns 0 when the deletion tree was drained, or the negative error from
+ * btrfs_search_slot().
+ */
+static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_delayed_item *curr, *prev;
+ int ret = 0;
+
+do_again:
+ mutex_lock(&node->mutex);
+ curr = __btrfs_first_delayed_deletion_item(node);
+ if (!curr)
+ goto delete_fail;
+
+ ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+ if (ret < 0)
+ goto delete_fail;
+ else if (ret > 0) {
+ /*
+ * can't find the item which the node points to, so this node
+ * is invalid, just drop it.
+ */
+ prev = curr;
+ curr = __btrfs_next_delayed_item(prev);
+ btrfs_release_delayed_item(prev);
+ ret = 0;
+ btrfs_release_path(path);
+ if (!curr)
+ goto delete_fail;
+ /*
+ * do_again locks node->mutex again and the mutex is not
+ * recursive, so drop it before jumping back.
+ */
+ mutex_unlock(&node->mutex);
+ goto do_again;
+ }
+
+ btrfs_batch_delete_items(trans, root, path, curr);
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ goto do_again;
+
+delete_fail:
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ return ret;
+}
+
+/*
+ * Clear the dirty-inode state of @delayed_node: reset inode_dirty, drop one
+ * from the node's count and from the delayed root's item counter, and wake
+ * waiters on the delayed root once the counter is below
+ * BTRFS_DELAYED_BACKGROUND.  Does nothing for a NULL or clean node.
+ */
+static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *droot;
+
+ if (!delayed_node || !delayed_node->inode_dirty)
+ return;
+
+ BUG_ON(!delayed_node->root);
+ delayed_node->inode_dirty = 0;
+ delayed_node->count--;
+
+ droot = delayed_node->root->fs_info->delayed_root;
+ atomic_dec(&droot->items);
+ if (atomic_read(&droot->items) >= BTRFS_DELAYED_BACKGROUND)
+ return;
+ if (waitqueue_active(&droot->wait))
+ wake_up(&droot->wait);
+}
+
+/*
+ * Write the inode item staged in @node into the tree.
+ *
+ * Runs entirely under node->mutex.  When the node is not marked dirty
+ * nothing is done.  Otherwise the INODE_ITEM for node->inode_id is looked
+ * up, overwritten with node->inode_item, and the node's reservation and
+ * dirty state are released.
+ *
+ * Returns 0 on success or when the node is clean, -ENOENT when the lookup
+ * reports the item as missing, or the negative error from
+ * btrfs_lookup_inode().
+ */
+static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_item *inode_item;
+ struct extent_buffer *leaf;
+ int ret;
+
+ mutex_lock(&node->mutex);
+ if (!node->inode_dirty) {
+ mutex_unlock(&node->mutex);
+ return 0;
+ }
+
+ key.objectid = node->inode_id;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &key, 1);
+ if (ret > 0) {
+ btrfs_release_path(path);
+ mutex_unlock(&node->mutex);
+ return -ENOENT;
+ } else if (ret < 0) {
+ mutex_unlock(&node->mutex);
+ return ret;
+ }
+
+ btrfs_unlock_up_safe(path, 1);
+ leaf = path->nodes[0];
+ inode_item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_inode_item);
+ /* overwrite the whole on-disk inode item with the staged copy */
+ write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
+ sizeof(struct btrfs_inode_item));
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ btrfs_delayed_inode_release_metadata(root, node);
+ btrfs_release_delayed_inode(node);
+ mutex_unlock(&node->mutex);
+
+ return 0;
+}
+
+/*
+ * Called when committing the transaction.
+ *
+ * Walks the delayed nodes of the delayed root and, for each one, runs its
+ * insertions, then its deletions, then its inode update, using the node's
+ * own root.  Stops at the first error.
+ *
+ * Returns 0 on success, -ENOMEM when no path can be allocated, or the first
+ * error returned by one of the three steps.
+ */
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+ struct btrfs_delayed_node *curr_node, *prev_node;
+ struct btrfs_path *path;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ curr_node = btrfs_first_delayed_node(delayed_root);
+ while (curr_node) {
+ /* each node is processed against the root it belongs to */
+ root = curr_node->root;
+ ret = btrfs_insert_delayed_items(trans, path, root,
+ curr_node);
+ if (!ret)
+ ret = btrfs_delete_delayed_items(trans, path, root,
+ curr_node);
+ if (!ret)
+ ret = btrfs_update_delayed_inode(trans, root, path,
+ curr_node);
+ if (ret) {
+ btrfs_release_delayed_node(curr_node);
+ break;
+ }
+
+ /* fetch the successor before releasing the current node */
+ prev_node = curr_node;
+ curr_node = btrfs_next_delayed_node(curr_node);
+ btrfs_release_delayed_node(prev_node);
+ }
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * Run the insertions, then the deletions, then the inode update of a single
+ * delayed node against node->root, stopping at the first failing step.
+ * Returns 0, -ENOMEM when no path can be allocated, or that step's error.
+ */
+static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_node *node)
+{
+ struct btrfs_path *path = btrfs_alloc_path();
+ int err;
+
+ if (!path)
+ return -ENOMEM;
+ path->leave_spinning = 1;
+
+ err = btrfs_insert_delayed_items(trans, path, node->root, node);
+ if (err)
+ goto free_path;
+
+ err = btrfs_delete_delayed_items(trans, path, node->root, node);
+ if (err)
+ goto free_path;
+
+ err = btrfs_update_delayed_inode(trans, node->root, path, node);
+
+free_path:
+ btrfs_free_path(path);
+ return err;
+}
+
+/*
+ * Flush the delayed items of one inode.
+ *
+ * Returns 0 when the inode has no delayed node or the node's count is zero;
+ * otherwise returns the result of __btrfs_commit_inode_delayed_items().
+ * The reference obtained from btrfs_get_delayed_node() is released on every
+ * path.
+ */
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+ int ret;
+
+ if (!delayed_node)
+ return 0;
+
+ /* count is only sampled under the mutex; the commit runs after unlock */
+ mutex_lock(&delayed_node->mutex);
+ if (!delayed_node->count) {
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+ ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+/*
+ * Detach the delayed node cached in the btrfs inode, if there is one: the
+ * inode's pointer is cleared and the node is passed to
+ * btrfs_release_delayed_node().
+ */
+void btrfs_remove_delayed_node(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ /* read the pointer exactly once */
+ delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
+ if (!delayed_node)
+ return;
+
+ BTRFS_I(inode)->delayed_node = NULL;
+ btrfs_release_delayed_node(delayed_node);
+}
+
+/*
+ * Work item that hands one delayed node to the delayed_workers queue.  It is
+ * allocated in btrfs_wq_run_delayed_node() and freed by the work function
+ * btrfs_async_run_delayed_node_done() unless the work is requeued.
+ */
+struct btrfs_async_delayed_node {
+ struct btrfs_root *root; /* root passed in by the code that queued the work */
+ struct btrfs_delayed_node *delayed_node; /* node to be processed */
+ struct btrfs_work work; /* embedded work; container_of() recovers this struct */
+};
+
+/*
+ * Work function for the delayed node worker.
+ *
+ * Joins a transaction on the node's root and runs the node's insertions,
+ * deletions and inode update.  Afterwards the node is either requeued (it
+ * still has pending work) or dequeued from the delayed root, released, and
+ * the work item is freed.
+ *
+ * NOTE(review): when btrfs_alloc_path() fails, delayed_node is still NULL at
+ * the out label, so async_node->delayed_node is not what gets passed to
+ * btrfs_release_prepared_delayed_node() - confirm this is intended.
+ */
+static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
+{
+ struct btrfs_async_delayed_node *async_node;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_path *path;
+ struct btrfs_delayed_node *delayed_node = NULL;
+ struct btrfs_root *root;
+ unsigned long nr = 0;
+ int need_requeue = 0;
+ int ret;
+
+ async_node = container_of(work, struct btrfs_async_delayed_node, work);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ goto out;
+ path->leave_spinning = 1;
+
+ delayed_node = async_node->delayed_node;
+ root = delayed_node->root;
+
+ trans = btrfs_join_transaction(root, 0);
+ if (IS_ERR(trans))
+ goto free_path;
+
+ ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
+ if (!ret)
+ ret = btrfs_delete_delayed_items(trans, path, root,
+ delayed_node);
+
+ /* the result of the inode update is not checked here */
+ if (!ret)
+ btrfs_update_delayed_inode(trans, root, path, delayed_node);
+
+ /*
+ * Maybe new delayed items have been inserted, so we need requeue
+ * the work. Besides that, we must dequeue the empty delayed nodes
+ * to avoid the race between delayed items balance and the worker.
+ * The race like this:
+ * Task1 Worker thread
+ * count == 0, needn't requeue
+ * also needn't insert the
+ * delayed node into prepare
+ * list again.
+ * add lots of delayed items
+ * queue the delayed node
+ * already in the list,
+ * and not in the prepare
+ * list, it means the delayed
+ * node is being dealt with
+ * by the worker.
+ * do delayed items balance
+ * the delayed node is being
+ * dealt with by the worker
+ * now, just wait.
+ * the worker goto idle.
+ * Task1 will sleep until the transaction is committed.
+ */
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->count)
+ need_requeue = 1;
+ else
+ btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
+ delayed_node);
+ mutex_unlock(&delayed_node->mutex);
+
+ /* sample blocks_used before the handle is ended */
+ nr = trans->blocks_used;
+
+ btrfs_end_transaction_dmeta(trans, root);
+ __btrfs_btree_balance_dirty(root, nr);
+free_path:
+ btrfs_free_path(path);
+out:
+ if (need_requeue)
+ btrfs_requeue_work(&async_node->work);
+ else {
+ btrfs_release_prepared_delayed_node(delayed_node);
+ kfree(async_node);
+ }
+}
+
+/*
+ * Queue prepared delayed nodes to the delayed_workers queue.
+ *
+ * One work item is allocated and queued per prepared node.  When @all is
+ * non-zero the loop continues until no prepared node is left; otherwise it
+ * stops after 4 nodes have been queued.
+ *
+ * Returns 0, or -ENOMEM when a work item cannot be allocated (the node that
+ * was just fetched is released in that case).
+ */
+static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
+ struct btrfs_root *root, int all)
+{
+ struct btrfs_async_delayed_node *async_node;
+ struct btrfs_delayed_node *curr;
+ int count = 0;
+
+again:
+ curr = btrfs_first_prepared_delayed_node(delayed_root);
+ if (!curr)
+ return 0;
+
+ async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
+ if (!async_node) {
+ btrfs_release_prepared_delayed_node(curr);
+ return -ENOMEM;
+ }
+
+ async_node->root = root;
+ async_node->delayed_node = curr;
+
+ async_node->work.func = btrfs_async_run_delayed_node_done;
+ async_node->work.flags = 0;
+
+ btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
+ count++;
+
+ if (all || count < 4)
+ goto again;
+
+ return 0;
+}
+
+/*
+ * Throttle the number of pending delayed items.
+ *
+ * Below BTRFS_DELAYED_BACKGROUND items nothing happens.  At or above
+ * BTRFS_DELAYED_WRITEBACK all prepared nodes are queued and the caller
+ * waits (interruptibly, at most HZ jiffies) until the count drops below
+ * BTRFS_DELAYED_BACKGROUND.  In between, a limited batch of nodes is queued
+ * without waiting.
+ */
+void btrfs_balance_delayed_items(struct btrfs_root *root)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ delayed_root = btrfs_get_delayed_root(root);
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+ return;
+
+ if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+ int ret;
+ ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
+ if (ret)
+ return;
+
+ /* the result of the wait is ignored */
+ wait_event_interruptible_timeout(
+ delayed_root->wait,
+ (atomic_read(&delayed_root->items) <
+ BTRFS_DELAYED_BACKGROUND),
+ HZ);
+ return;
+ }
+
+ btrfs_wq_run_delayed_node(delayed_root, root, 0);
+}
+
+/*
+ * Record a directory index insertion as a delayed item of @dir.
+ *
+ * The delayed item carries a struct btrfs_dir_item followed by @name_len
+ * bytes of @name and is keyed (ino of dir, BTRFS_DIR_INDEX_KEY, index).
+ *
+ * Returns 0 on success, the error from btrfs_get_or_create_delayed_node(),
+ * or -ENOMEM when the delayed item cannot be allocated.  A failure to
+ * reserve metadata or to add the item to the insertion tree hits BUG.
+ */
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *delayed_item;
+ struct btrfs_dir_item *dir_item;
+ int ret;
+
+ delayed_node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
+ if (!delayed_item) {
+ ret = -ENOMEM;
+ goto release_node;
+ }
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+ delayed_item->key.objectid = btrfs_ino(dir);
+ btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
+ delayed_item->key.offset = index;
+
+ /* build the dir item in place; the name is stored right after it */
+ dir_item = (struct btrfs_dir_item *)delayed_item->data;
+ dir_item->location = *disk_key;
+ dir_item->transid = cpu_to_le64(trans->transid);
+ dir_item->data_len = 0;
+ dir_item->name_len = cpu_to_le16(name_len);
+ dir_item->type = type;
+ memcpy((char *)(dir_item + 1), name, name_len);
+
+ mutex_lock(&delayed_node->mutex);
+ ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
+ if (unlikely(ret)) {
+ /* NOTE(review): %s assumes name is NUL-terminated - confirm with callers */
+ printk(KERN_ERR "err add delayed dir index item(name: %s) into "
+ "the insertion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)\n",
+ name,
+ (unsigned long long)delayed_node->root->objectid,
+ (unsigned long long)delayed_node->inode_id,
+ ret);
+ BUG();
+ }
+ mutex_unlock(&delayed_node->mutex);
+
+release_node:
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+/*
+ * Cancel a pending insertion: look up @key among the node's insertion items
+ * under node->mutex and, when present, release its metadata reservation and
+ * drop the item.
+ *
+ * Returns 0 when an item was found and dropped, 1 when there was none.
+ */
+static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
+ struct btrfs_delayed_node *node,
+ struct btrfs_key *key)
+{
+ struct btrfs_delayed_item *item;
+
+ mutex_lock(&node->mutex);
+ item = __btrfs_lookup_delayed_insertion_item(node, key);
+ if (!item) {
+ mutex_unlock(&node->mutex);
+ return 1;
+ }
+
+ btrfs_delayed_item_release_metadata(root, item);
+ btrfs_release_delayed_item(item);
+ mutex_unlock(&node->mutex);
+ return 0;
+}
+
+/*
+ * Record the deletion of directory index @index of @dir.
+ *
+ * When a delayed insertion for the same key is still pending it is simply
+ * cancelled.  Otherwise a data-less delayed item with that key is added to
+ * the node's deletion tree.
+ *
+ * Returns 0 on success, the error from btrfs_get_or_create_delayed_node(),
+ * or -ENOMEM when the delayed item cannot be allocated.  A failure to
+ * reserve metadata or to add the item to the deletion tree hits BUG.
+ */
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index)
+{
+ struct btrfs_delayed_node *node;
+ struct btrfs_delayed_item *item;
+ struct btrfs_key item_key;
+ int ret;
+
+ node = btrfs_get_or_create_delayed_node(dir);
+ if (IS_ERR(node))
+ return PTR_ERR(node);
+
+ item_key.objectid = btrfs_ino(dir);
+ btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
+ item_key.offset = index;
+
+ /* 0 means a pending insertion was found and cancelled: nothing else to do */
+ ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
+ if (!ret)
+ goto end;
+
+ item = btrfs_alloc_delayed_item(0);
+ if (!item) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ item->key = item_key;
+
+ ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
+ /*
+ * we have reserved enough space when we start a new transaction,
+ * so reserving metadata failure is impossible.
+ */
+ BUG_ON(ret);
+
+ mutex_lock(&node->mutex);
+ ret = __btrfs_add_delayed_deletion_item(node, item);
+ if (unlikely(ret)) {
+ printk(KERN_ERR "err add delayed dir index item(index: %llu) "
+ "into the deletion tree of the delayed node"
+ "(root id: %llu, inode id: %llu, errno: %d)\n",
+ (unsigned long long)index,
+ (unsigned long long)node->root->objectid,
+ (unsigned long long)node->inode_id,
+ ret);
+ BUG();
+ }
+ mutex_unlock(&node->mutex);
+end:
+ btrfs_release_delayed_node(node);
+ return ret;
+}
+
+/*
+ * Copy the directory index counter kept in the inode's delayed node into
+ * the btrfs inode.
+ *
+ * Returns 0 on success, -ENOENT when the inode has no delayed node, or
+ * -EINVAL when the delayed node's counter is zero.
+ */
+int btrfs_inode_delayed_dir_index_count(struct inode *inode)
+{
+ struct btrfs_delayed_node *node = BTRFS_I(inode)->delayed_node;
+
+ if (!node)
+ return -ENOENT;
+
+ /*
+ * Since we have held i_mutex of this directory, it is impossible that
+ * a new directory index is added into the delayed node and index_cnt
+ * is updated now. So we needn't lock the delayed node.
+ */
+ if (node->index_cnt == 0)
+ return -EINVAL;
+
+ BTRFS_I(inode)->index_cnt = node->index_cnt;
+ return 0;
+}
+
+/*
+ * Snapshot the delayed items of @inode for readdir.
+ *
+ * Under the delayed node's mutex every insertion item is appended to
+ * @ins_list and every deletion item to @del_list (through readdir_list),
+ * taking one reference on each item.  The references are dropped by the
+ * readdir helpers or by btrfs_put_delayed_items().  Does nothing when the
+ * inode has no delayed node.
+ */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_node *delayed_node;
+ struct btrfs_delayed_item *item;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ mutex_lock(&delayed_node->mutex);
+ item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, ins_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+
+ item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (item) {
+ atomic_inc(&item->refs);
+ list_add_tail(&item->readdir_list, del_list);
+ item = __btrfs_next_delayed_item(item);
+ }
+ mutex_unlock(&delayed_node->mutex);
+ /*
+ * This delayed node is still cached in the btrfs inode, so refs
+ * must be > 1 now, and we needn't check it is going to be freed
+ * or not.
+ *
+ * Besides that, this function is used to read dir, we do not
+ * insert/delete delayed items in this period. So we also needn't
+ * requeue or dequeue this delayed node.
+ */
+ atomic_dec(&delayed_node->refs);
+}
+
+/*
+ * Drop whatever is still linked on the two readdir lists: each item is
+ * unlinked from its list, loses one reference, and is freed when that was
+ * the last one.
+ */
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list)
+{
+ struct btrfs_delayed_item *curr, *next;
+
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+
+ list_for_each_entry_safe(curr, next, del_list, readdir_list) {
+ list_del(&curr->readdir_list);
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ }
+}
+
+/*
+ * Tell readdir whether directory index @index has a pending delayed
+ * deletion.
+ *
+ * Entries of @del_list whose key offset is not above @index are consumed
+ * while searching: each is unlinked and its reference dropped.  The walk
+ * stops at the first entry with a larger offset.
+ *
+ * Returns 1 when an entry with exactly @index was found, 0 otherwise.
+ */
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index)
+{
+ struct btrfs_delayed_item *pos, *tmp;
+ int found;
+
+ if (list_empty(del_list))
+ return 0;
+
+ list_for_each_entry_safe(pos, tmp, del_list, readdir_list) {
+ if (pos->key.offset > index)
+ return 0;
+
+ /* decide before the entry may be freed */
+ found = (pos->key.offset == index);
+
+ list_del(&pos->readdir_list);
+ if (atomic_dec_and_test(&pos->refs))
+ kfree(pos);
+
+ if (found)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
+ *
+ * Walks @ins_list, consuming each entry (unlink plus reference drop).
+ * Entries whose key offset is below filp->f_pos are skipped; for the others
+ * f_pos is set to the offset and the entry is reported through @filldir.
+ *
+ * Returns 1 as soon as @filldir returns non-zero, 0 otherwise.
+ */
+int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
+ filldir_t filldir,
+ struct list_head *ins_list)
+{
+ struct btrfs_dir_item *di;
+ struct btrfs_delayed_item *curr, *next;
+ struct btrfs_key location;
+ char *name;
+ int name_len;
+ int over = 0;
+ unsigned char d_type;
+
+ if (list_empty(ins_list))
+ return 0;
+
+ /*
+ * Changing the data of the delayed item is impossible. So
+ * we needn't lock them. And we have held i_mutex of the
+ * directory, nobody can delete any directory indexes now.
+ */
+ list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ list_del(&curr->readdir_list);
+
+ if (curr->key.offset < filp->f_pos) {
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+ continue;
+ }
+
+ filp->f_pos = curr->key.offset;
+
+ /* the item data is a dir item immediately followed by the name */
+ di = (struct btrfs_dir_item *)curr->data;
+ name = (char *)(di + 1);
+ name_len = le16_to_cpu(di->name_len);
+
+ d_type = btrfs_filetype_table[di->type];
+ btrfs_disk_key_to_cpu(&location, &di->location);
+
+ over = filldir(dirent, name, name_len, curr->key.offset,
+ location.objectid, d_type);
+
+ if (atomic_dec_and_test(&curr->refs))
+ kfree(curr);
+
+ if (over)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Accessor instantiations for the fields of struct btrfs_inode_item and
+ * struct btrfs_timespec; the last argument is the field width in bits.
+ * The btrfs_set_stack_* setters used by fill_stack_inode_item() presumably
+ * come from these expansions - see the macro definition to confirm.
+ */
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+ generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+ sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+ transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+ nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+ block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
+
+/*
+ * Fill an in-memory struct btrfs_inode_item from the VFS inode and its btrfs
+ * inode: ownership, size (disk_i_size), mode, link count, byte count,
+ * generation, sequence, rdev, flags, block group and the three timestamps.
+ * The item's transid is taken from the transaction handle.
+ */
+static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
+ struct btrfs_inode_item *inode_item,
+ struct inode *inode)
+{
+ btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
+ btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
+ btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
+ btrfs_set_stack_inode_generation(inode_item,
+ BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+ btrfs_set_stack_inode_transid(inode_item, trans->transid);
+ btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
+ btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
+ btrfs_set_stack_inode_block_group(inode_item,
+ BTRFS_I(inode)->block_group);
+
+ /* access time */
+ btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+ inode->i_atime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+ inode->i_atime.tv_nsec);
+
+ /* modification time */
+ btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+ inode->i_mtime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+ inode->i_mtime.tv_nsec);
+
+ /* change time */
+ btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+ inode->i_ctime.tv_sec);
+ btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+ inode->i_ctime.tv_nsec);
+}
+
+/*
+ * Stage an inode update in the inode's delayed node instead of writing the
+ * inode item to the tree right away.
+ *
+ * When the node already holds a dirty inode item, the staged copy is just
+ * refreshed.  Otherwise metadata is reserved, the item is filled in, the
+ * node is marked dirty and the node's and the delayed root's counters are
+ * bumped.
+ *
+ * Returns 0 on success or the error from btrfs_get_or_create_delayed_node().
+ * A failure to reserve metadata hits BUG_ON.
+ */
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+ /* must be 0: the already-dirty path returns it without assigning */
+ int ret = 0;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ mutex_lock(&delayed_node->mutex);
+ if (delayed_node->inode_dirty) {
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ goto release_node;
+ }
+
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+ /*
+ * we must reserve enough space when we start a new transaction,
+ * so reserving metadata failure is impossible
+ */
+ BUG_ON(ret);
+
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
+ delayed_node->inode_dirty = 1;
+ delayed_node->count++;
+ atomic_inc(&root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return ret;
+}
+
+/*
+ * Throw away everything pending on @delayed_node without applying it: under
+ * the node's mutex, every insertion and deletion item has its reservation
+ * released and is dropped, and a dirty inode item is released as well.
+ */
+static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_root *root = delayed_node->root;
+ struct btrfs_delayed_item *curr_item, *prev_item;
+
+ mutex_lock(&delayed_node->mutex);
+ curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ /* fetch the successor before the current item is released */
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
+ while (curr_item) {
+ btrfs_delayed_item_release_metadata(root, curr_item);
+ prev_item = curr_item;
+ curr_item = __btrfs_next_delayed_item(prev_item);
+ btrfs_release_delayed_item(prev_item);
+ }
+
+ if (delayed_node->inode_dirty) {
+ btrfs_delayed_inode_release_metadata(root, delayed_node);
+ btrfs_release_delayed_inode(delayed_node);
+ }
+ mutex_unlock(&delayed_node->mutex);
+}
+
+/*
+ * Discard all delayed items of @inode without applying them.  Does nothing
+ * when the inode has no delayed node.
+ */
+void btrfs_kill_delayed_inode_items(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_delayed_node(inode);
+ if (!delayed_node)
+ return;
+
+ __btrfs_kill_delayed_node(delayed_node);
+ btrfs_release_delayed_node(delayed_node);
+}
+
+/*
+ * Discard the pending items of every delayed node registered in
+ * root->delayed_nodes_tree.
+ *
+ * The radix tree is scanned in batches of up to ARRAY_SIZE(delayed_nodes)
+ * entries.  References are taken under root->inode_lock; the nodes are
+ * killed and released after the lock is dropped.
+ */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
+{
+ u64 inode_id = 0;
+ struct btrfs_delayed_node *delayed_nodes[8];
+ int i, n;
+
+ while (1) {
+ spin_lock(&root->inode_lock);
+ n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
+ (void **)delayed_nodes, inode_id,
+ ARRAY_SIZE(delayed_nodes));
+ if (!n) {
+ spin_unlock(&root->inode_lock);
+ break;
+ }
+
+ /* the next lookup starts right after the last node of this batch */
+ inode_id = delayed_nodes[n - 1]->inode_id + 1;
+
+ for (i = 0; i < n; i++)
+ atomic_inc(&delayed_nodes[i]->refs);
+ spin_unlock(&root->inode_lock);
+
+ for (i = 0; i < n; i++) {
+ __btrfs_kill_delayed_node(delayed_nodes[i]);
+ btrfs_release_delayed_node(delayed_nodes[i]);
+ }
+ }
+}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000000..eb7d240aa648
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2011 Fujitsu. All rights reserved.
+ * Written by Miao Xie <miaox@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __DELAYED_TREE_OPERATION_H
+#define __DELAYED_TREE_OPERATION_H
+
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+#include <asm/atomic.h>
+
+#include "ctree.h"
+
+/* types of the delayed item */
+#define BTRFS_DELAYED_INSERTION_ITEM 1
+#define BTRFS_DELAYED_DELETION_ITEM 2
+
+/* Per-filesystem bookkeeping for delayed nodes and delayed items. */
+struct btrfs_delayed_root {
+ spinlock_t lock;
+ struct list_head node_list;
+ /*
+ * Used for delayed nodes which is waiting to be dealt with by the
+ * worker. If the delayed node is inserted into the work queue, we
+ * drop it from this list.
+ */
+ struct list_head prepare_list;
+ atomic_t items; /* for delayed items */
+ int nodes; /* for delayed nodes */
+ /* woken when items falls below BTRFS_DELAYED_BACKGROUND */
+ wait_queue_head_t wait;
+};
+
+/* Delayed work attached to one inode: pending items plus a staged inode item. */
+struct btrfs_delayed_node {
+ u64 inode_id; /* objectid used for the inode item key */
+ u64 bytes_reserved; /* metadata reserved for the staged inode update */
+ struct btrfs_root *root; /* root the delayed items are applied to */
+ /* Used to add the node into the delayed root's node list. */
+ struct list_head n_list;
+ /*
+ * Used to add the node into the prepare list, the nodes in this list
+ * is waiting to be dealt with by the async worker.
+ */
+ struct list_head p_list;
+ struct rb_root ins_root; /* presumably the pending insertion items */
+ struct rb_root del_root; /* presumably the pending deletion items */
+ struct mutex mutex; /* taken while the item trees or inode state are touched */
+ struct btrfs_inode_item inode_item; /* staged copy written out when inode_dirty */
+ atomic_t refs; /* reference count */
+ u64 index_cnt; /* copied into the btrfs inode's index_cnt on request */
+ bool in_list;
+ bool inode_dirty; /* inode_item holds an update not yet written */
+ int count; /* non-zero while the node has pending work */
+};
+
+/* One pending tree insertion or deletion, with its payload stored inline. */
+struct btrfs_delayed_item {
+ struct rb_node rb_node;
+ struct btrfs_key key; /* key of the tree item to insert or delete */
+ struct list_head tree_list; /* used for batch insert/delete items */
+ struct list_head readdir_list; /* used for readdir items */
+ u64 bytes_reserved; /* metadata bytes reserved for this item */
+ struct btrfs_block_rsv *block_rsv; /* reservation the bytes are released to */
+ struct btrfs_delayed_node *delayed_node;
+ atomic_t refs; /* reference count; readdir takes extra references */
+ int ins_or_del; /* presumably one of the BTRFS_DELAYED_*_ITEM types */
+ u32 data_len; /* number of bytes in data[] */
+ char data[0]; /* item payload, data_len bytes */
+};
+
+/*
+ * Put a delayed root into its empty initial state: zeroed counters, empty
+ * node and prepare lists, and an initialised lock and wait queue.
+ */
+static inline void btrfs_init_delayed_root(
+ struct btrfs_delayed_root *delayed_root)
+{
+ spin_lock_init(&delayed_root->lock);
+ init_waitqueue_head(&delayed_root->wait);
+
+ INIT_LIST_HEAD(&delayed_root->node_list);
+ INIT_LIST_HEAD(&delayed_root->prepare_list);
+
+ delayed_root->nodes = 0;
+ atomic_set(&delayed_root->items, 0);
+}
+
+int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, const char *name,
+ int name_len, struct inode *dir,
+ struct btrfs_disk_key *disk_key, u8 type,
+ u64 index);
+
+int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *dir,
+ u64 index);
+
+int btrfs_inode_delayed_dir_index_count(struct inode *inode);
+
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+
+void btrfs_balance_delayed_items(struct btrfs_root *root);
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+/* Used for evicting the inode. */
+void btrfs_remove_delayed_node(struct inode *inode);
+void btrfs_kill_delayed_inode_items(struct inode *inode);
+
+
+int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode);
+
+/* Used for drop dead root */
+void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
+
+/* Used for readdir() */
+void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
+ struct list_head *del_list);
+void btrfs_put_delayed_items(struct list_head *ins_list,
+ struct list_head *del_list);
+int btrfs_should_delete_dir_index(struct list_head *del_list,
+ u64 index);
+int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
+ filldir_t filldir,
+ struct list_head *ins_list);
+
+/* for init */
+int __init btrfs_delayed_inode_init(void);
+void btrfs_delayed_inode_exit(void);
+#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..125cf76fcd08 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -281,44 +281,6 @@ again:
}
/*
- * This checks to see if there are any delayed refs in the
- * btree for a given bytenr. It returns one if it finds any
- * and zero otherwise.
- *
- * If it only finds a head node, it returns 0.
- *
- * The idea is to use this when deciding if you can safely delete an
- * extent from the extent allocation tree. There may be a pending
- * ref in the rbtree that adds or removes references, so as long as this
- * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
- * allocation tree.
- */
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
-{
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *prev_node;
- int ret = 0;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
-
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref) {
- prev_node = rb_prev(&ref->rb_node);
- if (!prev_node)
- goto out;
- ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
- rb_node);
- if (ref->bytenr == bytenr)
- ret = 1;
- }
-out:
- spin_unlock(&delayed_refs->lock);
- return ret;
-}
-
-/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
* bytenr and parent
@@ -483,6 +445,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&head_ref->cluster);
mutex_init(&head_ref->mutex);
+ trace_btrfs_delayed_ref_head(ref, head_ref, action);
+
existing = tree_insert(&delayed_refs->root, &ref->rb_node);
if (existing) {
@@ -537,6 +501,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
}
full_ref->level = level;
+ trace_btrfs_delayed_tree_ref(ref, full_ref, action);
+
existing = tree_insert(&delayed_refs->root, &ref->rb_node);
if (existing) {
@@ -591,6 +557,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
full_ref->objectid = owner;
full_ref->offset = offset;
+ trace_btrfs_delayed_data_ref(ref, full_ref, action);
+
existing = tree_insert(&delayed_refs->root, &ref->rb_node);
if (existing) {
@@ -741,79 +709,3 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
return btrfs_delayed_node_to_head(ref);
return NULL;
}
-
-/*
- * add a delayed ref to the tree. This does all of the accounting required
- * to make sure the delayed ref is eventually processed before this
- * transaction commits.
- *
- * The main point of this call is to add and remove a backreference in a single
- * shot, taking the lock only once, and only searching for the head node once.
- *
- * It is the same as doing a ref add and delete in two separate calls.
- */
-#if 0
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 orig_parent,
- u64 parent, u64 orig_ref_root, u64 ref_root,
- u64 orig_ref_generation, u64 ref_generation,
- u64 owner_objectid, int pin)
-{
- struct btrfs_delayed_ref *ref;
- struct btrfs_delayed_ref *old_ref;
- struct btrfs_delayed_ref_head *head_ref;
- struct btrfs_delayed_ref_root *delayed_refs;
- int ret;
-
- ref = kmalloc(sizeof(*ref), GFP_NOFS);
- if (!ref)
- return -ENOMEM;
-
- old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
- if (!old_ref) {
- kfree(ref);
- return -ENOMEM;
- }
-
- /*
- * the parent = 0 case comes from cases where we don't actually
- * know the parent yet. It will get updated later via a add/drop
- * pair.
- */
- if (parent == 0)
- parent = bytenr;
- if (orig_parent == 0)
- orig_parent = bytenr;
-
- head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
- if (!head_ref) {
- kfree(ref);
- kfree(old_ref);
- return -ENOMEM;
- }
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
-
- /*
- * insert both the head node and the new ref without dropping
- * the spin lock
- */
- ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
- (u64)-1, 0, 0, 0,
- BTRFS_UPDATE_DELAYED_HEAD, 0);
- BUG_ON(ret);
-
- ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
- parent, ref_root, ref_generation,
- owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
- BUG_ON(ret);
-
- ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
- orig_parent, orig_ref_root,
- orig_ref_generation, owner_objectid,
- BTRFS_DROP_DELAYED_REF, pin);
- BUG_ON(ret);
- spin_unlock(&delayed_refs->lock);
- return 0;
-}
-#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 50e3cf92fbda..e287e3b0eab0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -166,12 +166,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
- u64 bytenr, u64 num_bytes, u64 orig_parent,
- u64 parent, u64 orig_ref_root, u64 ref_root,
- u64 orig_ref_generation, u64 ref_generation,
- u64 owner_objectid, int pin);
int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head);
int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be7..685f2593c4f0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -50,7 +50,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
if (di)
return ERR_PTR(-EEXIST);
ret = btrfs_extend_item(trans, root, path, data_size);
- WARN_ON(ret > 0);
}
if (ret < 0)
return ERR_PTR(ret);
@@ -124,8 +123,9 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len, u64 dir,
- struct btrfs_key *location, u8 type, u64 index)
+ *root, const char *name, int name_len,
+ struct inode *dir, struct btrfs_key *location,
+ u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
@@ -137,13 +137,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_disk_key disk_key;
u32 data_size;
- key.objectid = dir;
+ key.objectid = btrfs_ino(dir);
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
+ btrfs_cpu_key_to_disk(&disk_key, location);
+
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
@@ -151,11 +155,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
goto second_insert;
- goto out;
+ goto out_free;
}
leaf = path->nodes[0];
- btrfs_cpu_key_to_disk(&disk_key, location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
@@ -170,29 +173,13 @@ second_insert:
/* FIXME, use some real flag for selecting the extra index */
if (root == root->fs_info->tree_root) {
ret = 0;
- goto out;
+ goto out_free;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
- key.offset = index;
- dir_item = insert_with_overflow(trans, root, path, &key, data_size,
- name, name_len);
- if (IS_ERR(dir_item)) {
- ret2 = PTR_ERR(dir_item);
- goto out;
- }
- leaf = path->nodes[0];
- btrfs_cpu_key_to_disk(&disk_key, location);
- btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
- btrfs_set_dir_type(leaf, dir_item, type);
- btrfs_set_dir_data_len(leaf, dir_item, 0);
- btrfs_set_dir_name_len(leaf, dir_item, name_len);
- btrfs_set_dir_transid(leaf, dir_item, trans->transid);
- name_ptr = (unsigned long)(dir_item + 1);
- write_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_mark_buffer_dirty(leaf);
-out:
+ ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
+ &disk_key, type, index);
+out_free:
btrfs_free_path(path);
if (ret)
return ret;
@@ -377,6 +364,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
leaf = path->nodes[0];
dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, dir_item))
+ return NULL;
+
total_len = btrfs_item_size_nr(leaf, path->slots[0]);
while (cur < total_len) {
this_len = sizeof(*dir_item) +
@@ -429,3 +419,42 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
}
return ret;
}
+
+/*
+ * Sanity-check a dir item stored in @leaf.
+ *
+ * Returns 0 when the item's type, name length and data length are all
+ * within bounds, and 1 (after logging a KERN_CRIT message) otherwise.
+ */
+int verify_dir_item(struct btrfs_root *root,
+ struct extent_buffer *leaf,
+ struct btrfs_dir_item *dir_item)
+{
+ u16 namelen = BTRFS_NAME_LEN;
+ u8 type = btrfs_dir_type(leaf, dir_item);
+
+ if (type >= BTRFS_FT_MAX) {
+ printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
+ (int)type);
+ return 1;
+ }
+
+ /* xattr items use XATTR_NAME_MAX as their name length limit */
+ if (type == BTRFS_FT_XATTR)
+ namelen = XATTR_NAME_MAX;
+
+ if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
+ printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
+ (unsigned)btrfs_dir_name_len(leaf, dir_item));
+ return 1;
+ }
+
+ /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
+ if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
+ printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+ (unsigned)btrfs_dir_data_len(leaf, dir_item));
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fdce8799b98d..98b6a71decba 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,8 @@
#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
+#include <linux/ratelimit.h>
+#include <asm/unaligned.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -40,6 +42,7 @@
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
+#include "inode-map.h"
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
@@ -136,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
* that covers the entire device
*/
static struct extent_map *btree_get_extent(struct inode *inode,
- struct page *page, size_t page_offset, u64 start, u64 len,
+ struct page *page, size_t pg_offset, u64 start, u64 len,
int create)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -153,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
}
read_unlock(&em_tree->lock);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
em = ERR_PTR(-ENOMEM);
goto out;
@@ -198,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
void btrfs_csum_final(u32 crc, char *result)
{
- *(__le32 *)result = ~cpu_to_le32(crc);
+ put_unaligned_le32(~crc, result);
}
/*
@@ -253,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs: %s checksum verify "
+ printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
"failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id,
(unsigned long long)buf->start, val, found,
btrfs_header_level(buf));
- }
if (result != (char *)&inline_result)
kfree(result);
return 1;
@@ -295,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- if (printk_ratelimit()) {
- printk("parent transid verify failed on %llu wanted %llu "
+ printk_ratelimited("parent transid verify failed on %llu wanted %llu "
"found %llu\n",
(unsigned long long)eb->start,
(unsigned long long)parent_transid,
(unsigned long long)btrfs_header_generation(eb));
- }
ret = 1;
clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
out:
@@ -323,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
int num_copies = 0;
int mirror_num = 0;
+ clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
while (1) {
ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
!verify_parent_transid(io_tree, eb, parent_transid))
return ret;
+ /*
+ * This buffer's crc is fine, but its contents are corrupted, so
+ * there is no reason to read the other copies, they won't be
+ * any less wrong.
+ */
+ if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
+ return ret;
+
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
if (num_copies == 1)
@@ -359,14 +367,18 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
tree = &BTRFS_I(page->mapping->host)->io_tree;
- if (page->private == EXTENT_PAGE_PRIVATE)
+ if (page->private == EXTENT_PAGE_PRIVATE) {
+ WARN_ON(1);
goto out;
- if (!page->private)
+ }
+ if (!page->private) {
+ WARN_ON(1);
goto out;
+ }
len = page->private >> 2;
WARN_ON(len == 0);
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ eb = alloc_extent_buffer(tree, start, len, page);
if (eb == NULL) {
WARN_ON(1);
goto out;
@@ -415,6 +427,88 @@ static int check_tree_block_fsid(struct btrfs_root *root,
return ret;
}
+/*
+ * Log a KERN_CRIT message about a corrupt leaf: @reason is a short string,
+ * @eb the extent buffer whose bytenr is reported, @root the root whose
+ * objectid is reported and @slot the slot number being complained about.
+ */
+#define CORRUPT(reason, eb, root, slot) \
+ printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d\n", (reason), \
+ (unsigned long long)btrfs_header_bytenr(eb), \
+ (unsigned long long)(root)->objectid, (slot))
+
+/*
+ * Sanity-check the item layout of a leaf.
+ *
+ * Checks that item 0 ends exactly at BTRFS_LEAF_DATA_SIZE(root), that the
+ * keys of adjacent items compare in order, that each item's data starts
+ * where the next item's data ends, and that the items examined do not end
+ * past the leaf data area.
+ *
+ * Returns 0 for an empty leaf or one that passes every check, and -EIO
+ * (after logging through CORRUPT()) at the first inconsistency.
+ */
+static noinline int check_leaf(struct btrfs_root *root,
+ struct extent_buffer *leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_key leaf_key;
+ u32 nritems = btrfs_header_nritems(leaf);
+ int slot;
+
+ if (nritems == 0)
+ return 0;
+
+ /* Check the 0 item */
+ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("invalid item offset size pair", leaf, root, 0);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure each item's keys are in the correct order and their
+ * offsets make sense. We only have to loop through nritems-1 because
+ * we check the current slot against the next slot, which verifies the
+ * next slot's offset+size makes sense and that the current slot's
+ * offset is correct.
+ */
+ for (slot = 0; slot < nritems - 1; slot++) {
+ btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
+ btrfs_item_key_to_cpu(leaf, &key, slot + 1);
+
+ /* Make sure the keys are in the right order */
+ if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
+ CORRUPT("bad key order", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Make sure the offset and ends are right, remember that the
+ * item data starts at the end of the leaf and grows towards the
+ * front.
+ */
+ if (btrfs_item_offset_nr(leaf, slot) !=
+ btrfs_item_end_nr(leaf, slot + 1)) {
+ CORRUPT("slot offset bad", leaf, root, slot);
+ return -EIO;
+ }
+
+ /*
+ * Check to make sure that we don't point outside of the leaf,
+ * just in case all the items are consistent with each other, but
+ * all point outside of the leaf.
+ */
+ if (btrfs_item_end_nr(leaf, slot) >
+ BTRFS_LEAF_DATA_SIZE(root)) {
+ CORRUPT("slot end outside of leaf", leaf, root, slot);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
{
@@ -444,7 +523,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
len = page->private >> 2;
WARN_ON(len == 0);
- eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
+ eb = alloc_extent_buffer(tree, start, len, page);
if (eb == NULL) {
ret = -EIO;
goto out;
@@ -452,12 +531,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
found_start = btrfs_header_bytenr(eb);
if (found_start != start) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad tree block start "
+ printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n",
(unsigned long long)found_start,
(unsigned long long)eb->start);
- }
ret = -EIO;
goto err;
}
@@ -469,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs bad fsid on block %llu\n",
+ printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start);
- }
ret = -EIO;
goto err;
}
@@ -481,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
btrfs_set_buffer_lockdep_class(eb, found_level);
ret = csum_tree_block(root, eb, 1);
- if (ret)
+ if (ret) {
ret = -EIO;
+ goto err;
+ }
+
+ /*
+ * If this is a leaf block and it is corrupt, set the corrupt bit so
+ * that we don't try and read the other copies of this block, just
+ * return -EIO.
+ */
+ if (found_level == 0 && check_leaf(root, eb)) {
+ set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ ret = -EIO;
+ }
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
@@ -557,12 +644,6 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
return 256 * limit;
}
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
-{
- return atomic_read(&info->nr_async_bios) >
- btrfs_async_submit_limit(info);
-}
-
static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async;
@@ -843,7 +924,6 @@ static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
- .sync_page = block_sync_page,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
@@ -871,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
struct inode *btree_inode = root->fs_info->btree_inode;
struct extent_buffer *eb;
eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, GFP_NOFS);
+ bytenr, blocksize);
return eb;
}
@@ -882,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize, NULL, GFP_NOFS);
+ bytenr, blocksize, NULL);
return eb;
}
@@ -966,13 +1046,13 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
root->block_rsv = NULL;
root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
- spin_lock_init(&root->node_lock);
spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->accounting_lock);
@@ -988,7 +1068,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->log_transid = 0;
root->last_log_commit = 0;
extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
@@ -1156,7 +1236,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root, fs_info, location->objectid);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret == 0) {
l = path->nodes[0];
@@ -1180,27 +1263,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
out:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
root->ref_cows = 1;
+ btrfs_check_and_init_root_item(&root->root_item);
+ }
return root;
}
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid)
-{
- struct btrfs_root *root;
-
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
- return fs_info->tree_root;
- if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
- return fs_info->extent_root;
-
- root = radix_tree_lookup(&fs_info->fs_roots_radix,
- (unsigned long)root_objectid);
- return root;
-}
-
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location)
{
@@ -1229,6 +1299,27 @@ again:
if (IS_ERR(root))
return root;
+ /*
+ * The fail label returns ERR_PTR(ret), so ret must hold an error
+ * code before we jump there on allocation failure.
+ */
+ root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
+ if (!root->free_ino_ctl) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
+ GFP_NOFS);
+ if (!root->free_ino_pinned) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ btrfs_init_free_ino_ctl(root);
+ mutex_init(&root->fs_commit_mutex);
+ spin_lock_init(&root->cache_lock);
+ init_waitqueue_head(&root->cache_wait);
+
set_anon_super(&root->anon_super, NULL);
if (btrfs_root_refs(&root->root_item) == 0) {
@@ -1272,41 +1355,6 @@ fail:
return ERR_PTR(ret);
}
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- const char *name, int namelen)
-{
- return btrfs_read_fs_root_no_name(fs_info, location);
-#if 0
- struct btrfs_root *root;
- int ret;
-
- root = btrfs_read_fs_root_no_name(fs_info, location);
- if (!root)
- return NULL;
-
- if (root->in_sysfs)
- return root;
-
- ret = btrfs_set_root_name(root, name, namelen);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
- }
-
- ret = btrfs_sysfs_add_root(root);
- if (ret) {
- free_extent_buffer(root->node);
- kfree(root->name);
- kfree(root);
- return ERR_PTR(ret);
- }
- root->in_sysfs = 1;
- return root;
-#endif
-}
-
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1314,7 +1362,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
struct btrfs_device *device;
struct backing_dev_info *bdi;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
bdi = blk_get_backing_dev_info(device->bdev);
@@ -1323,86 +1372,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
break;
}
}
+ rcu_read_unlock();
return ret;
}
/*
- * this unplugs every device on the box, and it is only used when page
- * is null
- */
-static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct btrfs_device *device;
- struct btrfs_fs_info *info;
-
- info = (struct btrfs_fs_info *)bdi->unplug_io_data;
- list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
- if (!device->bdev)
- continue;
-
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, page);
- }
-}
-
-static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
- struct inode *inode;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct address_space *mapping;
- u64 offset;
-
- /* the generic O_DIRECT read code does this */
- if (1 || !page) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- /*
- * page->mapping may change at any time. Get a consistent copy
- * and use that for everything below
- */
- smp_mb();
- mapping = page->mapping;
- if (!mapping)
- return;
-
- inode = mapping->host;
-
- /*
- * don't do the expensive searching for a small number of
- * devices
- */
- if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- offset = page_offset(page);
-
- em_tree = &BTRFS_I(inode)->extent_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- read_unlock(&em_tree->lock);
- if (!em) {
- __unplug_io_fn(bdi, page);
- return;
- }
-
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- __unplug_io_fn(bdi, page);
- return;
- }
- offset = offset - em->start;
- btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
- em->block_start + offset, page);
- free_extent_map(em);
-}
-
-/*
* If this fails, caller must call bdi_destroy() to get rid of the
* bdi again.
*/
@@ -1416,8 +1390,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return err;
bdi->ra_pages = default_backing_dev_info.ra_pages;
- bdi->unplug_io_fn = btrfs_unplug_io_fn;
- bdi->unplug_io_data = info;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
@@ -1503,6 +1475,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(root);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ btrfs_run_defrag_inodes(root->fs_info);
}
if (freezing(current)) {
@@ -1592,7 +1565,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *tree_root = btrfs_sb(sb);
- struct btrfs_fs_info *fs_info = tree_root->fs_info;
+ struct btrfs_fs_info *fs_info = NULL;
struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
GFP_NOFS);
struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1604,11 +1577,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
struct btrfs_super_block *disk_super;
- if (!extent_root || !tree_root || !fs_info ||
+ if (!extent_root || !tree_root || !tree_root->fs_info ||
!chunk_root || !dev_root || !csum_root) {
err = -ENOMEM;
goto fail;
}
+ fs_info = tree_root->fs_info;
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
@@ -1628,6 +1602,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
goto fail_bdi;
}
+ fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
+
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1641,6 +1617,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
+ spin_lock_init(&fs_info->defrag_inodes_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1663,15 +1640,35 @@ struct btrfs_root *open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->defrag_running, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 0;
+ fs_info->defrag_inodes = RB_ROOT;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
INIT_LIST_HEAD(&fs_info->ordered_extents);
spin_lock_init(&fs_info->ordered_extent_lock);
+ fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
+ GFP_NOFS);
+ if (!fs_info->delayed_root) {
+ err = -ENOMEM;
+ goto fail_iput;
+ }
+ btrfs_init_delayed_root(fs_info->delayed_root);
+
+ mutex_init(&fs_info->scrub_lock);
+ atomic_set(&fs_info->scrubs_running, 0);
+ atomic_set(&fs_info->scrub_pause_req, 0);
+ atomic_set(&fs_info->scrubs_paused, 0);
+ atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->scrub_pause_wait);
+ init_rwsem(&fs_info->scrub_super_lock);
+ fs_info->scrub_workers_refcnt = 0;
+ btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+ fs_info->thread_pool_size, &fs_info->generic_worker);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -1690,10 +1687,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
- fs_info->btree_inode->i_mapping,
- GFP_NOFS);
- extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
- GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
+ extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -1707,9 +1702,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->block_group_cache_tree = RB_ROOT;
extent_io_tree_init(&fs_info->freed_extents[0],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
extent_io_tree_init(&fs_info->freed_extents[1],
- fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->btree_inode->i_mapping);
fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
@@ -1739,7 +1734,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
if (!bh) {
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
@@ -1751,17 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
disk_super = &fs_info->super_copy;
if (!btrfs_super_root(disk_super))
- goto fail_iput;
+ goto fail_alloc;
/* check FS state, whether FS is broken. */
fs_info->fs_state |= btrfs_super_flags(disk_super);
btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+ /*
+ * In the long term, we'll store the compression type in the super
+ * block, and it'll be used for per file compression control.
+ */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
ret = btrfs_parse_options(tree_root, options);
if (ret) {
err = ret;
- goto fail_iput;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super) &
@@ -1771,7 +1772,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
"unsupported optional features (%Lx).\n",
(unsigned long long)features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super);
@@ -1787,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
"unsupported option features (%Lx).\n",
(unsigned long long)features);
err = -EINVAL;
- goto fail_iput;
+ goto fail_alloc;
}
btrfs_init_workers(&fs_info->generic_worker,
@@ -1834,6 +1835,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
&fs_info->generic_worker);
btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1, &fs_info->generic_worker);
+ btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
+ fs_info->thread_pool_size,
+ &fs_info->generic_worker);
/*
* endios are largely parallel and should have a very
@@ -1855,6 +1859,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
+ btrfs_start_workers(&fs_info->delayed_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1963,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->metadata_alloc_profile = (u64)-1;
fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+ goto fail_block_groups;
+ }
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2054,9 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
if (!(sb->s_flags & MS_RDONLY)) {
down_read(&fs_info->cleanup_work_sem);
- btrfs_orphan_cleanup(fs_info->fs_root);
- btrfs_orphan_cleanup(fs_info->tree_root);
+ err = btrfs_orphan_cleanup(fs_info->fs_root);
+ if (!err)
+ err = btrfs_orphan_cleanup(fs_info->tree_root);
up_read(&fs_info->cleanup_work_sem);
+ if (err) {
+ close_ctree(tree_root);
+ return ERR_PTR(err);
+ }
}
return tree_root;
@@ -2100,6 +2116,9 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->delayed_workers);
+fail_alloc:
+ kfree(fs_info->delayed_root);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2127,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (printk_ratelimit()) {
- printk(KERN_WARNING "lost page write due to "
+ printk_ratelimited(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
bdevname(bh->b_bdev, b));
- }
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
*/
@@ -2295,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
head = &root->fs_info->fs_devices->devices;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev) {
total_errors++;
continue;
@@ -2328,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
total_errors = 0;
- list_for_each_entry(dev, head, dev_list) {
+ list_for_each_entry_rcu(dev, head, dev_list) {
if (!dev->bdev)
continue;
if (!dev->in_fs_metadata || !dev->writeable)
@@ -2366,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
return 0;
}
static void free_fs_root(struct btrfs_root *root)
{
+ iput(root->cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
@@ -2379,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
}
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
+ kfree(root->free_ino_ctl);
+ kfree(root->free_ino_pinned);
kfree(root->name);
kfree(root);
}
@@ -2431,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
+ int err;
+
root_objectid = gang[i]->root_key.objectid;
- btrfs_orphan_cleanup(gang[i]);
+ err = btrfs_orphan_cleanup(gang[i]);
+ if (err)
+ return err;
}
root_objectid++;
}
@@ -2478,6 +2504,15 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ btrfs_scrub_cancel(root);
+
+ /* wait for any defraggers to finish */
+ wait_event(fs_info->transaction_wait,
+ (atomic_read(&fs_info->defrag_running) == 0));
+
+ /* clear out the rbtree of defraggable inodes */
+ btrfs_run_defrag_inodes(root->fs_info);
+
btrfs_put_block_group_cache(fs_info);
/*
@@ -2489,7 +2524,7 @@ int close_ctree(struct btrfs_root *root)
* ERROR state on disk.
*
* 2. when btrfs flips readonly just in btrfs_commit_super,
- * and in such case, btrfs cannnot write sb via btrfs_commit_super,
+ * and in such case, btrfs cannot write sb via btrfs_commit_super,
* and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
* btrfs will cleanup all FS resources first and write sb then.
*/
@@ -2536,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
del_fs_roots(fs_info);
iput(fs_info->btree_inode);
+ kfree(fs_info->delayed_root);
btrfs_stop_workers(&fs_info->generic_worker);
btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2547,6 +2583,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->endio_freespace_worker);
btrfs_stop_workers(&fs_info->submit_workers);
+ btrfs_stop_workers(&fs_info->delayed_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2623,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
if (current->flags & PF_MEMALLOC)
return;
+ btrfs_balance_delayed_items(root);
+
+ num_dirty = root->fs_info->dirty_metadata_bytes;
+
+ if (num_dirty > thresh) {
+ balance_dirty_pages_ratelimited_nr(
+ root->fs_info->btree_inode->i_mapping, 1);
+ }
+ return;
+}
+
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+{
+ /*
+ * looks as though older kernels can get into trouble with
+ * this code, they end up stuck in balance_dirty_pages forever
+ */
+ u64 num_dirty;
+ unsigned long thresh = 32 * 1024 * 1024;
+
+ if (current->flags & PF_MEMALLOC)
+ return;
+
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
@@ -2655,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
goto out;
len = page->private >> 2;
- eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS);
+ eb = find_extent_buffer(io_tree, bytenr, len);
if (!eb)
goto out;
@@ -2782,6 +2842,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
spin_lock(&delayed_refs->lock);
if (delayed_refs->num_entries == 0) {
+ spin_unlock(&delayed_refs->lock);
printk(KERN_INFO "delayed_refs has NO entry\n");
return ret;
}
@@ -2943,7 +3004,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
break;
/* opt_discard */
- ret = btrfs_error_discard_extent(root, start, end + 1 - start);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_error_discard_extent(root, start,
+ end + 1 - start,
+ NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
btrfs_error_unpin_extent_range(root, start, end);
@@ -3012,7 +3076,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
btrfs_destroy_pinned_extent(root,
root->fs_info->pinned_extents);
- t->use_count = 0;
+ atomic_set(&t->use_count, 0);
list_del_init(&t->list);
memset(t, 0, sizeof(*t));
kmem_cache_free(btrfs_transaction_cachep, t);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 07b20dc2fd95..a0b610a67aae 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -55,35 +55,20 @@ int btrfs_commit_super(struct btrfs_root *root);
int btrfs_error_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_objectid);
-struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location,
- const char *name, int namelen);
struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_key *location);
struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
struct btrfs_key *location);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-int btrfs_insert_dev_radix(struct btrfs_root *root,
- struct block_device *bdev,
- u64 device_id,
- u64 block_start,
- u64 num_blocks);
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
+void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
-int wait_on_tree_block_writeback(struct btrfs_root *root,
- struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
-int btrfs_open_device(struct btrfs_device *dev);
-int btrfs_verify_block_csum(struct btrfs_root *root,
- struct extent_buffer *buf);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -91,8 +76,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
-
-int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b2..1b8dc33778f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,14 +21,18 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
int len = *max_len;
int type;
- if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
- (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+ if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+ *max_len = BTRFS_FID_SIZE_CONNECTABLE;
return 255;
+ } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+ *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+ return 255;
+ }
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
- fid->objectid = inode->i_ino;
+ fid->objectid = btrfs_ino(inode);
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
@@ -174,13 +178,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = root->fs_info->tree_root;
} else {
- key.objectid = dir->i_ino;
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -240,6 +244,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
struct btrfs_key key;
int name_len;
int ret;
+ u64 ino;
if (!dir || !inode)
return -EINVAL;
@@ -247,19 +252,21 @@ static int btrfs_get_name(struct dentry *parent, char *name,
if (!S_ISDIR(dir->i_mode))
return -EINVAL;
+ ino = btrfs_ino(inode);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->leave_spinning = 1;
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = root->fs_info->tree_root;
} else {
- key.objectid = inode->i_ino;
- key.offset = dir->i_ino;
+ key.objectid = ino;
+ key.offset = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
}
@@ -268,7 +275,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
path->slots[0]--;
} else {
btrfs_free_path(path);
@@ -277,11 +284,11 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
leaf = path->nodes[0];
- if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
- rref = btrfs_item_ptr(leaf, path->slots[0],
+ if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ rref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
- name_ptr = (unsigned long)(rref + 1);
- name_len = btrfs_root_ref_name_len(leaf, rref);
+ name_ptr = (unsigned long)(rref + 1);
+ name_len = btrfs_root_ref_name_len(leaf, rref);
} else {
iref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_ref);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e7e012ad667..169bd62ce776 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
#include "locking.h"
#include "free-space-cache.h"
+/* control flags for do_chunk_alloc's force field
+ * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
+ * if we really need one.
+ *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_LIMITED means to only try and allocate one
+ * if we have very few chunks already allocated. This is
+ * used as part of the clustering code to help make sure
+ * we have a good pool of storage to cluster in, without
+ * filling the FS with empty chunks
+ *
+ */
+enum {
+ CHUNK_ALLOC_NO_FORCE = 0,
+ CHUNK_ALLOC_FORCE = 1,
+ CHUNK_ALLOC_LIMITED = 2,
+};
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
-static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -77,7 +94,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
return (cache->flags & bits) == bits;
}
-void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
atomic_inc(&cache->count);
}
@@ -88,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
WARN_ON(cache->pinned > 0);
WARN_ON(cache->reserved > 0);
WARN_ON(cache->reserved_pinned > 0);
+ kfree(cache->free_space_ctl);
kfree(cache);
}
}
@@ -362,7 +380,7 @@ again:
break;
caching_ctl->progress = last;
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
up_read(&fs_info->extent_commit_sem);
mutex_unlock(&caching_ctl->mutex);
if (btrfs_transaction_in_commit(fs_info))
@@ -442,7 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
* allocate blocks for the tree root we can't do the fast caching since
* we likely hold important locks.
*/
- if (!trans->transaction->in_commit &&
+ if (trans && (!trans->transaction->in_commit) &&
(root && root != root->fs_info->tree_root)) {
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
@@ -471,7 +489,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
if (load_cache_only)
return 0;
- caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
BUG_ON(!caching_ctl);
INIT_LIST_HEAD(&caching_ctl->list);
@@ -737,8 +755,12 @@ again:
atomic_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
+ /*
+ * Mutex was contended, block until it's released and try
+ * again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
@@ -917,7 +939,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
break;
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (owner < BTRFS_FIRST_FREE_OBJECTID)
new_size += sizeof(*bi);
@@ -930,7 +952,6 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
BUG_ON(ret);
ret = btrfs_extend_item(trans, root, path, new_size);
- BUG_ON(ret);
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1025,7 +1046,7 @@ again:
return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
key.type = BTRFS_EXTENT_REF_V0_KEY;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0) {
err = ret;
@@ -1063,7 +1084,7 @@ again:
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
err = 0;
@@ -1124,7 +1145,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
if (match_extent_data_ref(leaf, ref, root_objectid,
owner, offset))
break;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.offset++;
ret = btrfs_insert_empty_item(trans, root, path, &key,
size);
@@ -1150,7 +1171,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
ret = 0;
fail:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -1276,7 +1297,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (ret == -ENOENT && parent) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.type = BTRFS_EXTENT_REF_V0_KEY;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
@@ -1305,7 +1326,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
}
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -1538,7 +1559,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
size = btrfs_extent_inline_ref_size(type);
ret = btrfs_extend_item(trans, root, path, size);
- BUG_ON(ret);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
@@ -1591,7 +1611,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
if (ret != -ENOENT)
return ret;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
*ref_ret = NULL;
if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -1667,7 +1687,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
end - ptr - size);
item_size -= size;
ret = btrfs_truncate_item(trans, root, path, item_size, 1);
- BUG_ON(ret);
}
btrfs_mark_buffer_dirty(leaf);
return 0;
@@ -1740,39 +1759,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
return ret;
}
-static void btrfs_issue_discard(struct block_device *bdev,
+static int btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
+ return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
+ u64 num_bytes, u64 *actual_bytes)
{
int ret;
- u64 map_length = num_bytes;
+ u64 discarded_bytes = 0;
struct btrfs_multi_bio *multi = NULL;
- if (!btrfs_test_opt(root, DISCARD))
- return 0;
/* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- bytenr, &map_length, &multi, 0);
+ ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+ bytenr, &num_bytes, &multi, 0);
if (!ret) {
struct btrfs_bio_stripe *stripe = multi->stripes;
int i;
- if (map_length > num_bytes)
- map_length = num_bytes;
for (i = 0; i < multi->num_stripes; i++, stripe++) {
- btrfs_issue_discard(stripe->dev->bdev,
- stripe->physical,
- map_length);
+ ret = btrfs_issue_discard(stripe->dev->bdev,
+ stripe->physical,
+ stripe->length);
+ if (!ret)
+ discarded_bytes += stripe->length;
+ else if (ret != -EOPNOTSUPP)
+ break;
}
kfree(multi);
}
+ if (discarded_bytes && ret == -EOPNOTSUPP)
+ ret = 0;
+
+ if (actual_bytes)
+ *actual_bytes = discarded_bytes;
+
return ret;
}
@@ -1839,7 +1864,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
__run_delayed_extent_op(extent_op, leaf, item);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
path->reada = 1;
path->leave_spinning = 1;
@@ -2274,6 +2299,10 @@ again:
atomic_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
+ /*
+ * Mutex was contended, block until it's
+ * released and try again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
@@ -2338,8 +2367,12 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
atomic_inc(&head->node.refs);
spin_unlock(&delayed_refs->lock);
- btrfs_release_path(root->fs_info->extent_root, path);
+ btrfs_release_path(path);
+ /*
+ * Mutex was contended, block until it's released and let
+ * caller try again
+ */
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
@@ -2487,126 +2520,6 @@ out:
return ret;
}
-#if 0
-int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, u32 nr_extents)
-{
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
- u64 root_gen;
- u32 nritems;
- int i;
- int level;
- int ret = 0;
- int shared = 0;
-
- if (!root->ref_cows)
- return 0;
-
- if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- shared = 0;
- root_gen = root->root_key.offset;
- } else {
- shared = 1;
- root_gen = trans->transid - 1;
- }
-
- level = btrfs_header_level(buf);
- nritems = btrfs_header_nritems(buf);
-
- if (level == 0) {
- struct btrfs_leaf_ref *ref;
- struct btrfs_extent_info *info;
-
- ref = btrfs_alloc_leaf_ref(root, nr_extents);
- if (!ref) {
- ret = -ENOMEM;
- goto out;
- }
-
- ref->root_gen = root_gen;
- ref->bytenr = buf->start;
- ref->owner = btrfs_header_owner(buf);
- ref->generation = btrfs_header_generation(buf);
- ref->nritems = nr_extents;
- info = ref->extents;
-
- for (i = 0; nr_extents > 0 && i < nritems; i++) {
- u64 disk_bytenr;
- btrfs_item_key_to_cpu(buf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(buf, i,
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(buf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
- if (disk_bytenr == 0)
- continue;
-
- info->bytenr = disk_bytenr;
- info->num_bytes =
- btrfs_file_extent_disk_num_bytes(buf, fi);
- info->objectid = key.objectid;
- info->offset = key.offset;
- info++;
- }
-
- ret = btrfs_add_leaf_ref(root, ref, shared);
- if (ret == -EEXIST && shared) {
- struct btrfs_leaf_ref *old;
- old = btrfs_lookup_leaf_ref(root, ref->bytenr);
- BUG_ON(!old);
- btrfs_remove_leaf_ref(root, old);
- btrfs_free_leaf_ref(root, old);
- ret = btrfs_add_leaf_ref(root, ref, shared);
- }
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- }
-out:
- return ret;
-}
-
-/* when a block goes through cow, we update the reference counts of
- * everything that block points to. The internal pointers of the block
- * can be in just about any order, and it is likely to have clusters of
- * things that are close together and clusters of things that are not.
- *
- * To help reduce the seeks that come with updating all of these reference
- * counts, sort them by byte number before actual updates are done.
- *
- * struct refsort is used to match byte number to slot in the btree block.
- * we sort based on the byte number and then use the slot to actually
- * find the item.
- *
- * struct refsort is smaller than strcut btrfs_item and smaller than
- * struct btrfs_key_ptr. Since we're currently limited to the page size
- * for a btree block, there's no way for a kmalloc of refsorts for a
- * single node to be bigger than a page.
- */
-struct refsort {
- u64 bytenr;
- u32 slot;
-};
-
-/*
- * for passing into sort()
- */
-static int refsort_cmp(const void *a_void, const void *b_void)
-{
- const struct refsort *a = a_void;
- const struct refsort *b = b_void;
-
- if (a->bytenr < b->bytenr)
- return -1;
- if (a->bytenr > b->bytenr)
- return 1;
- return 0;
-}
-#endif
-
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2709,7 +2622,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
fail:
if (ret)
return ret;
@@ -2762,7 +2675,7 @@ again:
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
ret = PTR_ERR(inode);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto out;
}
@@ -2831,7 +2744,7 @@ again:
out_put:
iput(inode);
out_free:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
out:
spin_lock(&block_group->lock);
block_group->disk_cache_state = dcs;
@@ -3015,7 +2928,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_readonly = 0;
found->bytes_may_use = 0;
found->full = 0;
- found->force_alloc = 0;
+ found->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ found->chunk_alloc = 0;
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
atomic_set(&found->caching_threads, 0);
@@ -3120,7 +3034,8 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
- if (root == root->fs_info->tree_root) {
+ if (root == root->fs_info->tree_root ||
+ BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
alloc_chunk = 0;
committed = 1;
}
@@ -3146,7 +3061,7 @@ again:
if (!data_sinfo->full && alloc_chunk) {
u64 alloc_target;
- data_sinfo->force_alloc = 1;
+ data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3156,7 +3071,8 @@ alloc:
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
bytes + 2 * 1024 * 1024,
- alloc_target, 0);
+ alloc_target,
+ CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -3186,18 +3102,6 @@ commit_trans:
goto again;
}
-#if 0 /* I hope we never need this code again, just in case */
- printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
- "%llu bytes_reserved, " "%llu bytes_pinned, "
- "%llu bytes_readonly, %llu may use %llu total\n",
- (unsigned long long)bytes,
- (unsigned long long)data_sinfo->bytes_used,
- (unsigned long long)data_sinfo->bytes_reserved,
- (unsigned long long)data_sinfo->bytes_pinned,
- (unsigned long long)data_sinfo->bytes_readonly,
- (unsigned long long)data_sinfo->bytes_may_use,
- (unsigned long long)data_sinfo->total_bytes);
-#endif
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -3235,31 +3139,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
- found->force_alloc = 1;
+ found->force_alloc = CHUNK_ALLOC_FORCE;
}
rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_root *root,
- struct btrfs_space_info *sinfo, u64 alloc_bytes)
+ struct btrfs_space_info *sinfo, u64 alloc_bytes,
+ int force)
{
u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+ u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
u64 thresh;
- if (sinfo->bytes_used + sinfo->bytes_reserved +
- alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+ if (force == CHUNK_ALLOC_FORCE)
+ return 1;
+
+ /*
+ * in limited mode, we want to have some free space up to
+ * about 1% of the FS size.
+ */
+ if (force == CHUNK_ALLOC_LIMITED) {
+ thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+ thresh = max_t(u64, 64 * 1024 * 1024,
+ div_factor_fine(thresh, 1));
+
+ if (num_bytes - num_allocated < thresh)
+ return 1;
+ }
+
+ /*
+ * we have two similar checks here, one based on percentage
+ * and one based on a hard number of 256MB. The idea
+ * is that if we have a good amount of free
+ * room, don't allocate a chunk. A good amount is
+ * less than 80% utilized of the chunks we have allocated,
+ * or more than 256MB free
+ */
+ if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
return 0;
- if (sinfo->bytes_used + sinfo->bytes_reserved +
- alloc_bytes < div_factor(num_bytes, 8))
+ if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
return 0;
thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+
+ /* 256MB or 5% of the FS */
thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
return 0;
-
return 1;
}
@@ -3269,10 +3198,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
+ int wait_for_alloc = 0;
int ret = 0;
- mutex_lock(&fs_info->chunk_mutex);
-
flags = btrfs_reduce_alloc_profile(extent_root, flags);
space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3283,21 +3211,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
BUG_ON(!space_info);
+again:
spin_lock(&space_info->lock);
if (space_info->force_alloc)
- force = 1;
+ force = space_info->force_alloc;
if (space_info->full) {
spin_unlock(&space_info->lock);
- goto out;
+ return 0;
}
- if (!force && !should_alloc_chunk(extent_root, space_info,
- alloc_bytes)) {
+ if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
spin_unlock(&space_info->lock);
- goto out;
+ return 0;
+ } else if (space_info->chunk_alloc) {
+ wait_for_alloc = 1;
+ } else {
+ space_info->chunk_alloc = 1;
}
+
spin_unlock(&space_info->lock);
+ mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * The chunk_mutex is held throughout the entirety of a chunk
+ * allocation, so once we've acquired the chunk_mutex we know that the
+ * other guy is done and we need to recheck and see if we should
+ * allocate.
+ */
+ if (wait_for_alloc) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_for_alloc = 0;
+ goto again;
+ }
+
/*
* If we have mixed data/metadata chunks we want to make sure we keep
* allocating mixed chunks instead of individual chunks.
@@ -3323,9 +3270,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info->full = 1;
else
ret = 1;
- space_info->force_alloc = 0;
+
+ space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+ space_info->chunk_alloc = 0;
spin_unlock(&space_info->lock);
-out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
}
@@ -3342,19 +3290,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
u64 max_reclaim;
u64 reclaimed = 0;
long time_left;
- int pause = 1;
int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
int loops = 0;
+ unsigned long progress;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
smp_mb();
reserved = space_info->bytes_reserved;
+ progress = space_info->reservation_progress;
if (reserved == 0)
return 0;
+ /* nothing to shrink - nothing to reclaim */
+ if (root->fs_info->delalloc_bytes == 0)
+ return 0;
+
max_reclaim = min(reserved, to_reclaim);
while (loops < 1024) {
@@ -3365,31 +3318,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
spin_lock(&space_info->lock);
- if (reserved > space_info->bytes_reserved) {
- loops = 0;
+ if (reserved > space_info->bytes_reserved)
reclaimed += reserved - space_info->bytes_reserved;
- } else {
- loops++;
- }
reserved = space_info->bytes_reserved;
spin_unlock(&space_info->lock);
+ loops++;
+
if (reserved == 0 || reclaimed >= max_reclaim)
break;
if (trans && trans->transaction->blocked)
return -EAGAIN;
- __set_current_state(TASK_INTERRUPTIBLE);
- time_left = schedule_timeout(pause);
+ time_left = schedule_timeout_interruptible(1);
/* We were interrupted, exit */
if (time_left)
break;
- pause <<= 1;
- if (pause > HZ / 10)
- pause = HZ / 10;
+ /* we've kicked the IO a few times, if anything has been freed,
+ * exit. There is no sense in looping here for a long time
+ * when we really need to commit the transaction, or there are
+ * just too many writers without enough free space
+ */
+
+ if (loops > 3) {
+ smp_mb();
+ if (progress != space_info->reservation_progress)
+ break;
+ }
}
return reclaimed >= to_reclaim;
@@ -3576,8 +3534,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
-void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
- struct btrfs_block_rsv *dest, u64 num_bytes)
+static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
{
struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3612,6 +3570,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
if (num_bytes) {
spin_lock(&space_info->lock);
space_info->bytes_reserved -= num_bytes;
+ space_info->reservation_progress++;
spin_unlock(&space_info->lock);
}
}
@@ -3779,23 +3738,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
u64 meta_used;
u64 data_used;
int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
-#if 0
- /*
- * per tree used space accounting can be inaccuracy, so we
- * can't rely on it.
- */
- spin_lock(&fs_info->extent_root->accounting_lock);
- num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
- spin_unlock(&fs_info->extent_root->accounting_lock);
-
- spin_lock(&fs_info->csum_root->accounting_lock);
- num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
- spin_unlock(&fs_info->csum_root->accounting_lock);
- spin_lock(&fs_info->tree_root->accounting_lock);
- num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
- spin_unlock(&fs_info->tree_root->accounting_lock);
-#endif
sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
spin_lock(&sinfo->lock);
data_used = sinfo->bytes_used;
@@ -3844,13 +3787,11 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved >= block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
sinfo->bytes_reserved -= num_bytes;
+ sinfo->reservation_progress++;
block_rsv->reserved = block_rsv->size;
block_rsv->full = 1;
}
-#if 0
- printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
- block_rsv->size, block_rsv->reserved);
-#endif
+
spin_unlock(&sinfo->lock);
spin_unlock(&block_rsv->lock);
}
@@ -3896,12 +3837,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
}
-static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
-{
- return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
- 3 * num_items;
-}
-
int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
int num_items)
@@ -3912,7 +3847,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
if (num_items == 0 || root->fs_info->chunk_root == root)
return 0;
- num_bytes = calc_trans_metadata_size(root, num_items);
+ num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
num_bytes);
if (!ret) {
@@ -3951,14 +3886,14 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
* If all of the metadata space is used, we can commit
* transaction and use space it freed.
*/
- u64 num_bytes = calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
void btrfs_orphan_release_metadata(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 num_bytes = calc_trans_metadata_size(root, 4);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}
@@ -3972,7 +3907,7 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
* two for root back/forward refs, two for directory entries
* and one for root of the snapshot.
*/
- u64 num_bytes = calc_trans_metadata_size(root, 5);
+ u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
dst_rsv->space_info = src_rsv->space_info;
return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
}
@@ -3988,6 +3923,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
u64 to_reserve;
int nr_extents;
+ int reserved_extents;
int ret;
if (btrfs_transaction_in_commit(root->fs_info))
@@ -3995,26 +3931,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
- if (nr_extents > BTRFS_I(inode)->reserved_extents) {
- nr_extents -= BTRFS_I(inode)->reserved_extents;
- to_reserve = calc_trans_metadata_size(root, nr_extents);
+ reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
+
+ if (nr_extents > reserved_extents) {
+ nr_extents -= reserved_extents;
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
} else {
nr_extents = 0;
to_reserve = 0;
}
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
to_reserve += calc_csum_metadata_size(inode, num_bytes);
ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
if (ret)
return ret;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->reserved_extents += nr_extents;
+ atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
atomic_inc(&BTRFS_I(inode)->outstanding_extents);
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
@@ -4029,24 +3963,34 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 to_free;
int nr_extents;
+ int reserved_extents;
num_bytes = ALIGN(num_bytes, root->sectorsize);
atomic_dec(&BTRFS_I(inode)->outstanding_extents);
WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
- if (nr_extents < BTRFS_I(inode)->reserved_extents) {
- nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
- BTRFS_I(inode)->reserved_extents -= nr_extents;
- } else {
- nr_extents = 0;
- }
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
+ do {
+ int old, new;
+
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+ if (nr_extents >= reserved_extents) {
+ nr_extents = 0;
+ break;
+ }
+ old = reserved_extents;
+ nr_extents = reserved_extents - nr_extents;
+ new = reserved_extents - nr_extents;
+ old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
+ reserved_extents, new);
+ if (likely(old == reserved_extents))
+ break;
+ reserved_extents = old;
+ } while (1);
to_free = calc_csum_metadata_size(inode, num_bytes);
if (nr_extents > 0)
- to_free += calc_trans_metadata_size(root, nr_extents);
+ to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
to_free);
@@ -4133,6 +4077,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
+ cache->space_info->reservation_progress++;
cache->space_info->bytes_used += num_bytes;
cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
@@ -4184,6 +4129,7 @@ static int pin_down_extent(struct btrfs_root *root,
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
+ cache->space_info->reservation_progress++;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@@ -4214,8 +4160,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
* update size of reserved extents. this function may return -EAGAIN
* if 'reserve' is true or 'sinfo' is false.
*/
-static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int sinfo)
+int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo)
{
int ret = 0;
if (sinfo) {
@@ -4234,6 +4180,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+ space_info->reservation_progress++;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -4353,7 +4300,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
if (ret)
break;
- ret = btrfs_discard_extent(root, start, end + 1 - start);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start,
+ end + 1 - start, NULL);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end);
@@ -4450,7 +4399,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
NULL, refs_to_drop,
is_data);
BUG_ON(ret);
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
path->leave_spinning = 1;
key.objectid = bytenr;
@@ -4489,7 +4438,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
owner_objectid, 0);
BUG_ON(ret < 0);
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
path->leave_spinning = 1;
key.objectid = bytenr;
@@ -4559,7 +4508,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
- btrfs_release_path(extent_root, path);
+ btrfs_release_path(path);
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
@@ -4694,10 +4643,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- ret = update_reserved_bytes(cache, buf->len, 0, 0);
+ ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
if (ret == -EAGAIN) {
/* block group became read-only */
- update_reserved_bytes(cache, buf->len, 0, 1);
+ btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
goto out;
}
@@ -4712,6 +4661,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (ret) {
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_reserved -= buf->len;
+ cache->space_info->reservation_progress++;
spin_unlock(&cache->space_info->lock);
}
goto out;
@@ -4733,6 +4683,11 @@ pin:
}
}
out:
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't matter
+ * anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
btrfs_put_block_group(cache);
}
@@ -4796,7 +4751,7 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
return 0;
wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
- (cache->free_space >= num_bytes));
+ (cache->free_space_ctl->free_space >= num_bytes));
put_caching_control(caching_ctl);
return 0;
@@ -5180,7 +5135,7 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
- ret = update_reserved_bytes(block_group, num_bytes, 1,
+ ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
(data & BTRFS_BLOCK_GROUP_DATA));
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
@@ -5271,11 +5226,13 @@ loop:
if (allowed_chunk_alloc) {
ret = do_chunk_alloc(trans, root, num_bytes +
- 2 * 1024 * 1024, data, 1);
+ 2 * 1024 * 1024, data,
+ CHUNK_ALLOC_LIMITED);
allowed_chunk_alloc = 0;
done_chunk_alloc = 1;
- } else if (!done_chunk_alloc) {
- space_info->force_alloc = 1;
+ } else if (!done_chunk_alloc &&
+ space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
+ space_info->force_alloc = CHUNK_ALLOC_LIMITED;
}
if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5361,7 +5318,8 @@ again:
*/
if (empty_size || root->ref_cows)
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes + 2 * 1024 * 1024, data, 0);
+ num_bytes + 2 * 1024 * 1024, data,
+ CHUNK_ALLOC_NO_FORCE);
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5373,10 +5331,10 @@ again:
num_bytes = num_bytes & ~(root->sectorsize - 1);
num_bytes = max(num_bytes, min_alloc_size);
do_chunk_alloc(trans, root->fs_info->extent_root,
- num_bytes, data, 1);
+ num_bytes, data, CHUNK_ALLOC_FORCE);
goto again;
}
- if (ret == -ENOSPC) {
+ if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(root->fs_info, data);
@@ -5386,6 +5344,8 @@ again:
dump_space_info(sinfo, num_bytes, 1);
}
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
+
return ret;
}
@@ -5401,12 +5361,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
return -ENOSPC;
}
- ret = btrfs_discard_extent(root, start, len);
+ if (btrfs_test_opt(root, DISCARD))
+ ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
- update_reserved_bytes(cache, len, 0, 1);
+ btrfs_update_reserved_bytes(cache, len, 0, 1);
btrfs_put_block_group(cache);
+ trace_btrfs_reserved_extent_free(root, start, len);
+
return ret;
}
@@ -5433,7 +5396,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5603,7 +5567,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+ ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -6036,6 +6000,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, blocksize, generation);
+ if (!next)
+ return -EIO;
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
}
@@ -6372,7 +6338,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
trans->block_rsv = block_rsv;
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
BUG_ON(err);
ret = btrfs_del_root(trans, tree_root, &root->root_key);
@@ -6427,10 +6393,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- BUG_ON(!wc);
+ if (!wc) {
+ btrfs_free_path(path);
+ return -ENOMEM;
+ }
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
@@ -6472,1495 +6442,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
-#if 0
-static unsigned long calc_ra(unsigned long start, unsigned long last,
- unsigned long nr)
-{
- return min(last, start + nr - 1);
-}
-
-static noinline int relocate_inode_pages(struct inode *inode, u64 start,
- u64 len)
-{
- u64 page_start;
- u64 page_end;
- unsigned long first_index;
- unsigned long last_index;
- unsigned long i;
- struct page *page;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct file_ra_state *ra;
- struct btrfs_ordered_extent *ordered;
- unsigned int total_read = 0;
- unsigned int total_dirty = 0;
- int ret = 0;
-
- ra = kzalloc(sizeof(*ra), GFP_NOFS);
- if (!ra)
- return -ENOMEM;
-
- mutex_lock(&inode->i_mutex);
- first_index = start >> PAGE_CACHE_SHIFT;
- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
-
- /* make sure the dirty trick played by the caller work */
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
- if (ret)
- goto out_unlock;
-
- file_ra_state_init(ra, inode->i_mapping);
-
- for (i = first_index ; i <= last_index; i++) {
- if (total_read % ra->ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- calc_ra(i, last_index, ra->ra_pages));
- }
- total_read++;
-again:
- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- BUG_ON(1);
- page = grab_cache_page(inode->i_mapping, i);
- if (!page) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- ret = -EIO;
- goto out_unlock;
- }
- }
- wait_on_page_writeback(page);
-
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
- set_page_extent_mapped(page);
-
- if (i == first_index)
- set_extent_bits(io_tree, page_start, page_end,
- EXTENT_BOUNDARY, GFP_NOFS);
- btrfs_set_extent_delalloc(inode, page_start, page_end);
-
- set_page_dirty(page);
- total_dirty++;
-
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- }
-
-out_unlock:
- kfree(ra);
- mutex_unlock(&inode->i_mutex);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
- return ret;
-}
-
-static noinline int relocate_data_extent(struct inode *reloc_inode,
- struct btrfs_key *extent_key,
- u64 offset)
-{
- struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
- struct extent_map *em;
- u64 start = extent_key->objectid - offset;
- u64 end = start + extent_key->offset - 1;
-
- em = alloc_extent_map(GFP_NOFS);
- BUG_ON(!em || IS_ERR(em));
-
- em->start = start;
- em->len = extent_key->offset;
- em->block_len = extent_key->offset;
- em->block_start = extent_key->objectid;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
-
- /* setup extent map to cheat btrfs_readpage */
- lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
- while (1) {
- int ret;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(reloc_inode, start, end, 0);
- }
- unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
-
- return relocate_inode_pages(reloc_inode, start, extent_key->offset);
-}
-
-struct btrfs_ref_path {
- u64 extent_start;
- u64 nodes[BTRFS_MAX_LEVEL];
- u64 root_objectid;
- u64 root_generation;
- u64 owner_objectid;
- u32 num_refs;
- int lowest_level;
- int current_level;
- int shared_level;
-
- struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
- u64 new_nodes[BTRFS_MAX_LEVEL];
-};
-
-struct disk_extent {
- u64 ram_bytes;
- u64 disk_bytenr;
- u64 disk_num_bytes;
- u64 offset;
- u64 num_bytes;
- u8 compression;
- u8 encryption;
- u16 other_encoding;
-};
-
-static int is_cowonly_root(u64 root_objectid)
-{
- if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
- root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
- root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
- root_objectid == BTRFS_DEV_TREE_OBJECTID ||
- root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID)
- return 1;
- return 0;
-}
-
-static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path,
- int first_time)
-{
- struct extent_buffer *leaf;
- struct btrfs_path *path;
- struct btrfs_extent_ref *ref;
- struct btrfs_key key;
- struct btrfs_key found_key;
- u64 bytenr;
- u32 nritems;
- int level;
- int ret = 1;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- if (first_time) {
- ref_path->lowest_level = -1;
- ref_path->current_level = -1;
- ref_path->shared_level = -1;
- goto walk_up;
- }
-walk_down:
- level = ref_path->current_level - 1;
- while (level >= -1) {
- u64 parent;
- if (level < ref_path->lowest_level)
- break;
-
- if (level >= 0)
- bytenr = ref_path->nodes[level];
- else
- bytenr = ref_path->extent_start;
- BUG_ON(bytenr == 0);
-
- parent = ref_path->nodes[level + 1];
- ref_path->nodes[level + 1] = 0;
- ref_path->current_level = level;
- BUG_ON(parent == 0);
-
- key.objectid = bytenr;
- key.offset = parent + 1;
- key.type = BTRFS_EXTENT_REF_KEY;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
- BUG_ON(ret == 0);
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- goto next;
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid == bytenr &&
- found_key.type == BTRFS_EXTENT_REF_KEY) {
- if (level < ref_path->shared_level)
- ref_path->shared_level = level;
- goto found;
- }
-next:
- level--;
- btrfs_release_path(extent_root, path);
- cond_resched();
- }
- /* reached lowest level */
- ret = 1;
- goto out;
-walk_up:
- level = ref_path->current_level;
- while (level < BTRFS_MAX_LEVEL - 1) {
- u64 ref_objectid;
-
- if (level >= 0)
- bytenr = ref_path->nodes[level];
- else
- bytenr = ref_path->extent_start;
-
- BUG_ON(bytenr == 0);
-
- key.objectid = bytenr;
- key.offset = 0;
- key.type = BTRFS_EXTENT_REF_KEY;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- /* the extent was freed by someone */
- if (ref_path->lowest_level == level)
- goto out;
- btrfs_release_path(extent_root, path);
- goto walk_down;
- }
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != bytenr ||
- found_key.type != BTRFS_EXTENT_REF_KEY) {
- /* the extent was freed by someone */
- if (ref_path->lowest_level == level) {
- ret = 1;
- goto out;
- }
- btrfs_release_path(extent_root, path);
- goto walk_down;
- }
-found:
- ref = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_ref);
- ref_objectid = btrfs_ref_objectid(leaf, ref);
- if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
- if (first_time) {
- level = (int)ref_objectid;
- BUG_ON(level >= BTRFS_MAX_LEVEL);
- ref_path->lowest_level = level;
- ref_path->current_level = level;
- ref_path->nodes[level] = bytenr;
- } else {
- WARN_ON(ref_objectid != level);
- }
- } else {
- WARN_ON(level != -1);
- }
- first_time = 0;
-
- if (ref_path->lowest_level == level) {
- ref_path->owner_objectid = ref_objectid;
- ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
- }
-
- /*
- * the block is tree root or the block isn't in reference
- * counted tree.
- */
- if (found_key.objectid == found_key.offset ||
- is_cowonly_root(btrfs_ref_root(leaf, ref))) {
- ref_path->root_objectid = btrfs_ref_root(leaf, ref);
- ref_path->root_generation =
- btrfs_ref_generation(leaf, ref);
- if (level < 0) {
- /* special reference from the tree log */
- ref_path->nodes[0] = found_key.offset;
- ref_path->current_level = 0;
- }
- ret = 0;
- goto out;
- }
-
- level++;
- BUG_ON(ref_path->nodes[level] != 0);
- ref_path->nodes[level] = found_key.offset;
- ref_path->current_level = level;
-
- /*
- * the reference was created in the running transaction,
- * no need to continue walking up.
- */
- if (btrfs_ref_generation(leaf, ref) == trans->transid) {
- ref_path->root_objectid = btrfs_ref_root(leaf, ref);
- ref_path->root_generation =
- btrfs_ref_generation(leaf, ref);
- ret = 0;
- goto out;
- }
-
- btrfs_release_path(extent_root, path);
- cond_resched();
- }
- /* reached max tree level, but no tree root found. */
- BUG();
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path,
- u64 extent_start)
-{
- memset(ref_path, 0, sizeof(*ref_path));
- ref_path->extent_start = extent_start;
-
- return __next_ref_path(trans, extent_root, ref_path, 1);
-}
-
-static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_ref_path *ref_path)
-{
- return __next_ref_path(trans, extent_root, ref_path, 0);
-}
-
-static noinline int get_new_locations(struct inode *reloc_inode,
- struct btrfs_key *extent_key,
- u64 offset, int no_fragment,
- struct disk_extent **extents,
- int *nr_extents)
-{
- struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
- struct btrfs_path *path;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- struct disk_extent *exts = *extents;
- struct btrfs_key found_key;
- u64 cur_pos;
- u64 last_byte;
- u32 nritems;
- int nr = 0;
- int max = *nr_extents;
- int ret;
-
- WARN_ON(!no_fragment && *extents);
- if (!exts) {
- max = 1;
- exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
- if (!exts)
- return -ENOMEM;
- }
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- cur_pos = extent_key->objectid - offset;
- last_byte = extent_key->objectid + extent_key->offset;
- ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
- cur_pos, 0);
- if (ret < 0)
- goto out;
- if (ret > 0) {
- ret = -ENOENT;
- goto out;
- }
-
- while (1) {
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.offset != cur_pos ||
- found_key.type != BTRFS_EXTENT_DATA_KEY ||
- found_key.objectid != reloc_inode->i_ino)
- break;
-
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) !=
- BTRFS_FILE_EXTENT_REG ||
- btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
- break;
-
- if (nr == max) {
- struct disk_extent *old = exts;
- max *= 2;
- exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
- memcpy(exts, old, sizeof(*exts) * nr);
- if (old != *extents)
- kfree(old);
- }
-
- exts[nr].disk_bytenr =
- btrfs_file_extent_disk_bytenr(leaf, fi);
- exts[nr].disk_num_bytes =
- btrfs_file_extent_disk_num_bytes(leaf, fi);
- exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
- exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
- exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
- exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
- exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
- exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
- fi);
- BUG_ON(exts[nr].offset > 0);
- BUG_ON(exts[nr].compression || exts[nr].encryption);
- BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
-
- cur_pos += exts[nr].num_bytes;
- nr++;
-
- if (cur_pos + offset >= last_byte)
- break;
-
- if (no_fragment) {
- ret = 1;
- goto out;
- }
- path->slots[0]++;
- }
-
- BUG_ON(cur_pos + offset > last_byte);
- if (cur_pos + offset < last_byte) {
- ret = -ENOENT;
- goto out;
- }
- ret = 0;
-out:
- btrfs_free_path(path);
- if (ret) {
- if (exts != *extents)
- kfree(exts);
- } else {
- *extents = exts;
- *nr_extents = nr;
- }
- return ret;
-}
-
-static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key,
- struct btrfs_key *leaf_key,
- struct btrfs_ref_path *ref_path,
- struct disk_extent *new_extents,
- int nr_extents)
-{
- struct extent_buffer *leaf;
- struct btrfs_file_extent_item *fi;
- struct inode *inode = NULL;
- struct btrfs_key key;
- u64 lock_start = 0;
- u64 lock_end = 0;
- u64 num_bytes;
- u64 ext_offset;
- u64 search_end = (u64)-1;
- u32 nritems;
- int nr_scaned = 0;
- int extent_locked = 0;
- int extent_type;
- int ret;
-
- memcpy(&key, leaf_key, sizeof(key));
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
- if (key.objectid < ref_path->owner_objectid ||
- (key.objectid == ref_path->owner_objectid &&
- key.type < BTRFS_EXTENT_DATA_KEY)) {
- key.objectid = ref_path->owner_objectid;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
- }
- }
-
- while (1) {
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0)
- goto out;
-
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
-next:
- if (extent_locked && ret > 0) {
- /*
- * the file extent item was modified by someone
- * before the extent got locked.
- */
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- extent_locked = 0;
- }
-
- if (path->slots[0] >= nritems) {
- if (++nr_scaned > 2)
- break;
-
- BUG_ON(extent_locked);
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- }
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
- if ((key.objectid > ref_path->owner_objectid) ||
- (key.objectid == ref_path->owner_objectid &&
- key.type > BTRFS_EXTENT_DATA_KEY) ||
- key.offset >= search_end)
- break;
- }
-
- if (inode && key.objectid != inode->i_ino) {
- BUG_ON(extent_locked);
- btrfs_release_path(root, path);
- mutex_unlock(&inode->i_mutex);
- iput(inode);
- inode = NULL;
- continue;
- }
-
- if (key.type != BTRFS_EXTENT_DATA_KEY) {
- path->slots[0]++;
- ret = 1;
- goto next;
- }
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- extent_type = btrfs_file_extent_type(leaf, fi);
- if ((extent_type != BTRFS_FILE_EXTENT_REG &&
- extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
- (btrfs_file_extent_disk_bytenr(leaf, fi) !=
- extent_key->objectid)) {
- path->slots[0]++;
- ret = 1;
- goto next;
- }
-
- num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
- ext_offset = btrfs_file_extent_offset(leaf, fi);
-
- if (search_end == (u64)-1) {
- search_end = key.offset - ext_offset +
- btrfs_file_extent_ram_bytes(leaf, fi);
- }
-
- if (!extent_locked) {
- lock_start = key.offset;
- lock_end = lock_start + num_bytes - 1;
- } else {
- if (lock_start > key.offset ||
- lock_end + 1 < key.offset + num_bytes) {
- unlock_extent(&BTRFS_I(inode)->io_tree,
- lock_start, lock_end, GFP_NOFS);
- extent_locked = 0;
- }
- }
-
- if (!inode) {
- btrfs_release_path(root, path);
-
- inode = btrfs_iget_locked(root->fs_info->sb,
- key.objectid, root);
- if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- BTRFS_I(inode)->location.objectid =
- key.objectid;
- BTRFS_I(inode)->location.type =
- BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
- }
- /*
- * some code call btrfs_commit_transaction while
- * holding the i_mutex, so we can't use mutex_lock
- * here.
- */
- if (is_bad_inode(inode) ||
- !mutex_trylock(&inode->i_mutex)) {
- iput(inode);
- inode = NULL;
- key.offset = (u64)-1;
- goto skip;
- }
- }
-
- if (!extent_locked) {
- struct btrfs_ordered_extent *ordered;
-
- btrfs_release_path(root, path);
-
- lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- lock_end);
- if (ordered &&
- ordered->file_offset <= lock_end &&
- ordered->file_offset + ordered->len > lock_start) {
- unlock_extent(&BTRFS_I(inode)->io_tree,
- lock_start, lock_end, GFP_NOFS);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- key.offset += num_bytes;
- goto skip;
- }
- if (ordered)
- btrfs_put_ordered_extent(ordered);
-
- extent_locked = 1;
- continue;
- }
-
- if (nr_extents == 1) {
- /* update extent pointer in place */
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extents[0].disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extents[0].disk_num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + num_bytes - 1, 0);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extents[0].disk_bytenr,
- new_extents[0].disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid,
- key.objectid);
- BUG_ON(ret);
-
- ret = btrfs_free_extent(trans, root,
- extent_key->objectid,
- extent_key->offset,
- leaf->start,
- btrfs_header_owner(leaf),
- btrfs_header_generation(leaf),
- key.objectid, 0);
- BUG_ON(ret);
-
- btrfs_release_path(root, path);
- key.offset += num_bytes;
- } else {
- BUG_ON(1);
-#if 0
- u64 alloc_hint;
- u64 extent_len;
- int i;
- /*
- * drop old extent pointer at first, then insert the
- * new pointers one bye one
- */
- btrfs_release_path(root, path);
- ret = btrfs_drop_extents(trans, root, inode, key.offset,
- key.offset + num_bytes,
- key.offset, &alloc_hint);
- BUG_ON(ret);
-
- for (i = 0; i < nr_extents; i++) {
- if (ext_offset >= new_extents[i].num_bytes) {
- ext_offset -= new_extents[i].num_bytes;
- continue;
- }
- extent_len = min(new_extents[i].num_bytes -
- ext_offset, num_bytes);
-
- ret = btrfs_insert_empty_item(trans, root,
- path, &key,
- sizeof(*fi));
- BUG_ON(ret);
-
- leaf = path->nodes[0];
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
- btrfs_set_file_extent_generation(leaf, fi,
- trans->transid);
- btrfs_set_file_extent_type(leaf, fi,
- BTRFS_FILE_EXTENT_REG);
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extents[i].disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extents[i].disk_num_bytes);
- btrfs_set_file_extent_ram_bytes(leaf, fi,
- new_extents[i].ram_bytes);
-
- btrfs_set_file_extent_compression(leaf, fi,
- new_extents[i].compression);
- btrfs_set_file_extent_encryption(leaf, fi,
- new_extents[i].encryption);
- btrfs_set_file_extent_other_encoding(leaf, fi,
- new_extents[i].other_encoding);
-
- btrfs_set_file_extent_num_bytes(leaf, fi,
- extent_len);
- ext_offset += new_extents[i].offset;
- btrfs_set_file_extent_offset(leaf, fi,
- ext_offset);
- btrfs_mark_buffer_dirty(leaf);
-
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + extent_len - 1, 0);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extents[i].disk_bytenr,
- new_extents[i].disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid, key.objectid);
- BUG_ON(ret);
- btrfs_release_path(root, path);
-
- inode_add_bytes(inode, extent_len);
-
- ext_offset = 0;
- num_bytes -= extent_len;
- key.offset += extent_len;
-
- if (num_bytes == 0)
- break;
- }
- BUG_ON(i >= nr_extents);
-#endif
- }
-
- if (extent_locked) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- extent_locked = 0;
- }
-skip:
- if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
- key.offset >= search_end)
- break;
-
- cond_resched();
- }
- ret = 0;
-out:
- btrfs_release_path(root, path);
- if (inode) {
- mutex_unlock(&inode->i_mutex);
- if (extent_locked) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
- lock_end, GFP_NOFS);
- }
- iput(inode);
- }
- return ret;
-}
-
-int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf, u64 orig_start)
-{
- int level;
- int ret;
-
- BUG_ON(btrfs_header_generation(buf) != trans->transid);
- BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
-
- level = btrfs_header_level(buf);
- if (level == 0) {
- struct btrfs_leaf_ref *ref;
- struct btrfs_leaf_ref *orig_ref;
-
- orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
- if (!orig_ref)
- return -ENOENT;
-
- ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
- if (!ref) {
- btrfs_free_leaf_ref(root, orig_ref);
- return -ENOMEM;
- }
-
- ref->nritems = orig_ref->nritems;
- memcpy(ref->extents, orig_ref->extents,
- sizeof(ref->extents[0]) * ref->nritems);
-
- btrfs_free_leaf_ref(root, orig_ref);
-
- ref->root_gen = trans->transid;
- ref->bytenr = buf->start;
- ref->owner = btrfs_header_owner(buf);
- ref->generation = btrfs_header_generation(buf);
-
- ret = btrfs_add_leaf_ref(root, ref, 0);
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- }
- return 0;
-}
-
-static noinline int invalidate_extent_cache(struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct btrfs_block_group_cache *group,
- struct btrfs_root *target_root)
-{
- struct btrfs_key key;
- struct inode *inode = NULL;
- struct btrfs_file_extent_item *fi;
- struct extent_state *cached_state = NULL;
- u64 num_bytes;
- u64 skip_objectid = 0;
- u32 nritems;
- u32 i;
-
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(leaf, &key, i);
- if (key.objectid == skip_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
- continue;
- if (!inode || inode->i_ino != key.objectid) {
- iput(inode);
- inode = btrfs_ilookup(target_root->fs_info->sb,
- key.objectid, target_root, 1);
- }
- if (!inode) {
- skip_objectid = key.objectid;
- continue;
- }
- num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, 0, &cached_state,
- GFP_NOFS);
- btrfs_drop_extent_cache(inode, key.offset,
- key.offset + num_bytes - 1, 1);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
- key.offset + num_bytes - 1, &cached_state,
- GFP_NOFS);
- cond_resched();
- }
- iput(inode);
- return 0;
-}
-
-static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode)
-{
- struct btrfs_key key;
- struct btrfs_key extent_key;
- struct btrfs_file_extent_item *fi;
- struct btrfs_leaf_ref *ref;
- struct disk_extent *new_extent;
- u64 bytenr;
- u64 num_bytes;
- u32 nritems;
- u32 i;
- int ext_index;
- int nr_extent;
- int ret;
-
- new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
- BUG_ON(!new_extent);
-
- ref = btrfs_lookup_leaf_ref(root, leaf->start);
- BUG_ON(!ref);
-
- ext_index = -1;
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(leaf, &key, i);
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- if (bytenr == 0)
- continue;
-
- ext_index++;
- if (bytenr >= group->key.objectid + group->key.offset ||
- bytenr + num_bytes <= group->key.objectid)
- continue;
-
- extent_key.objectid = bytenr;
- extent_key.offset = num_bytes;
- extent_key.type = BTRFS_EXTENT_ITEM_KEY;
- nr_extent = 1;
- ret = get_new_locations(reloc_inode, &extent_key,
- group->key.objectid, 1,
- &new_extent, &nr_extent);
- if (ret > 0)
- continue;
- BUG_ON(ret < 0);
-
- BUG_ON(ref->extents[ext_index].bytenr != bytenr);
- BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
- ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
- ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
-
- btrfs_set_file_extent_disk_bytenr(leaf, fi,
- new_extent->disk_bytenr);
- btrfs_set_file_extent_disk_num_bytes(leaf, fi,
- new_extent->disk_num_bytes);
- btrfs_mark_buffer_dirty(leaf);
-
- ret = btrfs_inc_extent_ref(trans, root,
- new_extent->disk_bytenr,
- new_extent->disk_num_bytes,
- leaf->start,
- root->root_key.objectid,
- trans->transid, key.objectid);
- BUG_ON(ret);
-
- ret = btrfs_free_extent(trans, root,
- bytenr, num_bytes, leaf->start,
- btrfs_header_owner(leaf),
- btrfs_header_generation(leaf),
- key.objectid, 0);
- BUG_ON(ret);
- cond_resched();
- }
- kfree(new_extent);
- BUG_ON(ext_index + 1 != ref->nritems);
- btrfs_free_leaf_ref(root, ref);
- return 0;
-}
-
-int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_root *reloc_root;
- int ret;
-
- if (root->reloc_root) {
- reloc_root = root->reloc_root;
- root->reloc_root = NULL;
- list_add(&reloc_root->dead_list,
- &root->fs_info->dead_reloc_roots);
-
- btrfs_set_root_bytenr(&reloc_root->root_item,
- reloc_root->node->start);
- btrfs_set_root_level(&root->root_item,
- btrfs_header_level(reloc_root->node));
- memset(&reloc_root->root_item.drop_progress, 0,
- sizeof(struct btrfs_disk_key));
- reloc_root->root_item.drop_level = 0;
-
- ret = btrfs_update_root(trans, root->fs_info->tree_root,
- &reloc_root->root_key,
- &reloc_root->root_item);
- BUG_ON(ret);
- }
- return 0;
-}
-
-int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *reloc_root;
- struct btrfs_root *prev_root = NULL;
- struct list_head dead_roots;
- int ret;
- unsigned long nr;
-
- INIT_LIST_HEAD(&dead_roots);
- list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
-
- while (!list_empty(&dead_roots)) {
- reloc_root = list_entry(dead_roots.prev,
- struct btrfs_root, dead_list);
- list_del_init(&reloc_root->dead_list);
-
- BUG_ON(reloc_root->commit_root != NULL);
- while (1) {
- trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
-
- mutex_lock(&root->fs_info->drop_mutex);
- ret = btrfs_drop_snapshot(trans, reloc_root);
- if (ret != -EAGAIN)
- break;
- mutex_unlock(&root->fs_info->drop_mutex);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- btrfs_btree_balance_dirty(root, nr);
- }
-
- free_extent_buffer(reloc_root->node);
-
- ret = btrfs_del_root(trans, root->fs_info->tree_root,
- &reloc_root->root_key);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->drop_mutex);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, root);
- BUG_ON(ret);
- btrfs_btree_balance_dirty(root, nr);
-
- kfree(prev_root);
- prev_root = reloc_root;
- }
- if (prev_root) {
- btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
- kfree(prev_root);
- }
- return 0;
-}
-
-int btrfs_add_dead_reloc_root(struct btrfs_root *root)
-{
- list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
- return 0;
-}
-
-int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
-{
- struct btrfs_root *reloc_root;
- struct btrfs_trans_handle *trans;
- struct btrfs_key location;
- int found;
- int ret;
-
- mutex_lock(&root->fs_info->tree_reloc_mutex);
- ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
- BUG_ON(ret);
- found = !list_empty(&root->fs_info->dead_reloc_roots);
- mutex_unlock(&root->fs_info->tree_reloc_mutex);
-
- if (found) {
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- }
-
- location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- location.offset = (u64)-1;
- location.type = BTRFS_ROOT_ITEM_KEY;
-
- reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
- BUG_ON(!reloc_root);
- btrfs_orphan_cleanup(reloc_root);
- return 0;
-}
-
-static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- struct btrfs_root *reloc_root;
- struct extent_buffer *eb;
- struct btrfs_root_item *root_item;
- struct btrfs_key root_key;
- int ret;
-
- BUG_ON(!root->ref_cows);
- if (root->reloc_root)
- return 0;
-
- root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
-
- ret = btrfs_copy_root(trans, root, root->commit_root,
- &eb, BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
-
- root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
- root_key.offset = root->root_key.objectid;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
-
- memcpy(root_item, &root->root_item, sizeof(root_item));
- btrfs_set_root_refs(root_item, 0);
- btrfs_set_root_bytenr(root_item, eb->start);
- btrfs_set_root_level(root_item, btrfs_header_level(eb));
- btrfs_set_root_generation(root_item, trans->transid);
-
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
-
- ret = btrfs_insert_root(trans, root->fs_info->tree_root,
- &root_key, root_item);
- BUG_ON(ret);
- kfree(root_item);
-
- reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
- &root_key);
- BUG_ON(!reloc_root);
- reloc_root->last_trans = trans->transid;
- reloc_root->commit_root = NULL;
- reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
-
- root->reloc_root = reloc_root;
- return 0;
-}
-
-/*
- * Core function of space balance.
- *
- * The idea is using reloc trees to relocate tree blocks in reference
- * counted roots. There is one reloc tree for each subvol, and all
- * reloc trees share same root key objectid. Reloc trees are snapshots
- * of the latest committed roots of subvols (root->commit_root).
- *
- * To relocate a tree block referenced by a subvol, there are two steps.
- * COW the block through subvol's reloc tree, then update block pointer
- * in the subvol to point to the new block. Since all reloc trees share
- * same root key objectid, doing special handing for tree blocks owned
- * by them is easy. Once a tree block has been COWed in one reloc tree,
- * we can use the resulting new block directly when the same block is
- * required to COW again through other reloc trees. By this way, relocated
- * tree blocks are shared between reloc trees, so they are also shared
- * between subvols.
- */
-static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *first_key,
- struct btrfs_ref_path *ref_path,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode)
-{
- struct btrfs_root *reloc_root;
- struct extent_buffer *eb = NULL;
- struct btrfs_key *keys;
- u64 *nodes;
- int level;
- int shared_level;
- int lowest_level = 0;
- int ret;
-
- if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
- lowest_level = ref_path->owner_objectid;
-
- if (!root->ref_cows) {
- path->lowest_level = lowest_level;
- ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
- BUG_ON(ret < 0);
- path->lowest_level = 0;
- btrfs_release_path(root, path);
- return 0;
- }
-
- mutex_lock(&root->fs_info->tree_reloc_mutex);
- ret = init_reloc_tree(trans, root);
- BUG_ON(ret);
- reloc_root = root->reloc_root;
-
- shared_level = ref_path->shared_level;
- ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
-
- keys = ref_path->node_keys;
- nodes = ref_path->new_nodes;
- memset(&keys[shared_level + 1], 0,
- sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
- memset(&nodes[shared_level + 1], 0,
- sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
-
- if (nodes[lowest_level] == 0) {
- path->lowest_level = lowest_level;
- ret = btrfs_search_slot(trans, reloc_root, first_key, path,
- 0, 1);
- BUG_ON(ret);
- for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
- eb = path->nodes[level];
- if (!eb || eb == reloc_root->node)
- break;
- nodes[level] = eb->start;
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &keys[level], 0);
- else
- btrfs_node_key_to_cpu(eb, &keys[level], 0);
- }
- if (nodes[0] &&
- ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- eb = path->nodes[0];
- ret = replace_extents_in_leaf(trans, reloc_root, eb,
- group, reloc_inode);
- BUG_ON(ret);
- }
- btrfs_release_path(reloc_root, path);
- } else {
- ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
- lowest_level);
- BUG_ON(ret);
- }
-
- /*
- * replace tree blocks in the fs tree with tree blocks in
- * the reloc tree.
- */
- ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
- BUG_ON(ret < 0);
-
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- ret = btrfs_search_slot(trans, reloc_root, first_key, path,
- 0, 0);
- BUG_ON(ret);
- extent_buffer_get(path->nodes[0]);
- eb = path->nodes[0];
- btrfs_release_path(reloc_root, path);
- ret = invalidate_extent_cache(reloc_root, eb, group, root);
- BUG_ON(ret);
- free_extent_buffer(eb);
- }
-
- mutex_unlock(&root->fs_info->tree_reloc_mutex);
- path->lowest_level = 0;
- return 0;
-}
-
-static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *first_key,
- struct btrfs_ref_path *ref_path)
-{
- int ret;
-
- ret = relocate_one_path(trans, root, path, first_key,
- ref_path, NULL, NULL);
- BUG_ON(ret);
-
- return 0;
-}
-
-static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key)
-{
- int ret;
-
- ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
- if (ret)
- goto out;
- ret = btrfs_del_item(trans, extent_root, path);
-out:
- btrfs_release_path(extent_root, path);
- return ret;
-}
-
-static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
- struct btrfs_ref_path *ref_path)
-{
- struct btrfs_key root_key;
-
- root_key.objectid = ref_path->root_objectid;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- if (is_cowonly_root(ref_path->root_objectid))
- root_key.offset = 0;
- else
- root_key.offset = (u64)-1;
-
- return btrfs_read_fs_root_no_name(fs_info, &root_key);
-}
-
-static noinline int relocate_one_extent(struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct btrfs_key *extent_key,
- struct btrfs_block_group_cache *group,
- struct inode *reloc_inode, int pass)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *found_root;
- struct btrfs_ref_path *ref_path = NULL;
- struct disk_extent *new_extents = NULL;
- int nr_extents = 0;
- int loops;
- int ret;
- int level;
- struct btrfs_key first_key;
- u64 prev_block = 0;
-
-
- trans = btrfs_start_transaction(extent_root, 1);
- BUG_ON(IS_ERR(trans));
-
- if (extent_key->objectid == 0) {
- ret = del_extent_zero(trans, extent_root, path, extent_key);
- goto out;
- }
-
- ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
- if (!ref_path) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (loops = 0; ; loops++) {
- if (loops == 0) {
- ret = btrfs_first_ref_path(trans, extent_root, ref_path,
- extent_key->objectid);
- } else {
- ret = btrfs_next_ref_path(trans, extent_root, ref_path);
- }
- if (ret < 0)
- goto out;
- if (ret > 0)
- break;
-
- if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
- continue;
-
- found_root = read_ref_root(extent_root->fs_info, ref_path);
- BUG_ON(!found_root);
- /*
- * for reference counted tree, only process reference paths
- * rooted at the latest committed root.
- */
- if (found_root->ref_cows &&
- ref_path->root_generation != found_root->root_key.offset)
- continue;
-
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- if (pass == 0) {
- /*
- * copy data extents to new locations
- */
- u64 group_start = group->key.objectid;
- ret = relocate_data_extent(reloc_inode,
- extent_key,
- group_start);
- if (ret < 0)
- goto out;
- break;
- }
- level = 0;
- } else {
- level = ref_path->owner_objectid;
- }
-
- if (prev_block != ref_path->nodes[level]) {
- struct extent_buffer *eb;
- u64 block_start = ref_path->nodes[level];
- u64 block_size = btrfs_level_size(found_root, level);
-
- eb = read_tree_block(found_root, block_start,
- block_size, 0);
- btrfs_tree_lock(eb);
- BUG_ON(level != btrfs_header_level(eb));
-
- if (level == 0)
- btrfs_item_key_to_cpu(eb, &first_key, 0);
- else
- btrfs_node_key_to_cpu(eb, &first_key, 0);
-
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- prev_block = block_start;
- }
-
- mutex_lock(&extent_root->fs_info->trans_mutex);
- btrfs_record_root_in_trans(found_root);
- mutex_unlock(&extent_root->fs_info->trans_mutex);
- if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
- /*
- * try to update data extent references while
- * keeping metadata shared between snapshots.
- */
- if (pass == 1) {
- ret = relocate_one_path(trans, found_root,
- path, &first_key, ref_path,
- group, reloc_inode);
- if (ret < 0)
- goto out;
- continue;
- }
- /*
- * use fallback method to process the remaining
- * references.
- */
- if (!new_extents) {
- u64 group_start = group->key.objectid;
- new_extents = kmalloc(sizeof(*new_extents),
- GFP_NOFS);
- nr_extents = 1;
- ret = get_new_locations(reloc_inode,
- extent_key,
- group_start, 1,
- &new_extents,
- &nr_extents);
- if (ret)
- goto out;
- }
- ret = replace_one_extent(trans, found_root,
- path, extent_key,
- &first_key, ref_path,
- new_extents, nr_extents);
- } else {
- ret = relocate_tree_block(trans, found_root, path,
- &first_key, ref_path);
- }
- if (ret < 0)
- goto out;
- }
- ret = 0;
-out:
- btrfs_end_transaction(trans, extent_root);
- kfree(new_extents);
- kfree(ref_path);
- return ret;
-}
-#endif
-
static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
u64 num_devices;
@@ -8050,13 +6531,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
alloc_flags = update_block_group_flags(root, cache->flags);
if (alloc_flags != cache->flags)
- do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
ret = set_block_group_ro(cache);
if (!ret)
goto out;
alloc_flags = get_alloc_profile(root, cache->space_info->flags);
- ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
ret = set_block_group_ro(cache);
@@ -8065,6 +6548,14 @@ out:
return ret;
}
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 type)
+{
+ u64 alloc_flags = get_alloc_profile(root, type);
+ return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+ CHUNK_ALLOC_FORCE);
+}
+
/*
* helper to account the unused space of all the readonly block group in the
* list. takes mirrors into account.
@@ -8414,10 +6905,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
ret = -ENOMEM;
goto error;
}
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ ret = -ENOMEM;
+ goto error;
+ }
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
- spin_lock_init(&cache->tree_lock);
cache->fs_info = info;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -8425,24 +6922,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
if (need_clear)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- /*
- * we only want to have 32k of ram per block group for keeping
- * track of free space, and if we pass 1/2 of that we want to
- * start converting things over to using bitmaps
- */
- cache->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
-
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
memcpy(&cache->key, &found_key, sizeof(found_key));
key.objectid = found_key.objectid + found_key.offset;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
+ btrfs_init_free_space_ctl(cache);
+
/*
* We need to exclude the super stripes now so that the space
* info has super bytes accounted for, otherwise we'll think
@@ -8529,6 +7020,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
return -ENOMEM;
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return -ENOMEM;
+ }
cache->key.objectid = chunk_offset;
cache->key.offset = size;
@@ -8536,19 +7033,13 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->sectorsize = root->sectorsize;
cache->fs_info = root->fs_info;
- /*
- * we only want to have 32k of ram per block group for keeping track
- * of free space, and if we pass 1/2 of that we want to start
- * converting things over to using bitmaps
- */
- cache->extents_thresh = ((1024 * 32) / 2) /
- sizeof(struct btrfs_free_space);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
- spin_lock_init(&cache->tree_lock);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+ btrfs_init_free_space_ctl(cache);
+
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
cache->flags = type;
@@ -8603,6 +7094,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
+ /*
+ * Free the reserved super bytes from this block group before
+ * removing it.
+ */
+ free_excluded_extents(root, block_group);
+
memcpy(&key, &block_group->key, sizeof(key));
if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
BTRFS_BLOCK_GROUP_RAID1 |
@@ -8655,12 +7152,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
if (ret > 0)
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
goto out;
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
}
spin_lock(&root->fs_info->block_group_cache_lock);
@@ -8706,13 +7203,99 @@ out:
return ret;
}
+int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+ struct btrfs_super_block *disk_super;
+ u64 features;
+ u64 flags;
+ int mixed = 0;
+ int ret;
+
+ disk_super = &fs_info->super_copy;
+ if (!btrfs_super_root(disk_super))
+ return 1;
+
+ features = btrfs_super_incompat_flags(disk_super);
+ if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+ mixed = 1;
+
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ if (mixed) {
+ flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ } else {
+ flags = BTRFS_BLOCK_GROUP_METADATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ if (ret)
+ goto out;
+
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ }
+out:
+ return ret;
+}
+
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
return unpin_extent_range(root, start, end);
}
int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
+ u64 num_bytes, u64 *actual_bytes)
{
- return btrfs_discard_extent(root, bytenr, num_bytes);
+ return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
+}
+
+int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 group_trimmed;
+ u64 start;
+ u64 end;
+ u64 trimmed = 0;
+ int ret = 0;
+
+ cache = btrfs_lookup_block_group(fs_info, range->start);
+
+ while (cache) {
+ if (cache->key.objectid >= (range->start + range->len)) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+
+ start = max(range->start, cache->key.objectid);
+ end = min(range->start + range->len,
+ cache->key.objectid + cache->key.offset);
+
+ if (end - start >= range->minlen) {
+ if (!block_group_cache_done(cache)) {
+ ret = cache_block_group(cache, NULL, root, 0);
+ if (!ret)
+ wait_block_group_cache_done(cache);
+ }
+ ret = btrfs_trim_block_group(cache,
+ &group_trimmed,
+ start,
+ end,
+ range->minlen);
+
+ trimmed += group_trimmed;
+ if (ret) {
+ btrfs_put_block_group(cache);
+ break;
+ }
+ }
+
+ cache = next_block_group(fs_info->tree_root, cache);
+ }
+
+ range->len = trimmed;
+ return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5e76a474cb7e..c5d9fbb92bc3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,8 @@
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/prefetch.h>
+#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
@@ -101,7 +103,7 @@ void extent_io_exit(void)
}
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping, gfp_t mask)
+ struct address_space *mapping)
{
tree->state = RB_ROOT;
INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
@@ -439,6 +441,15 @@ static int clear_state_bit(struct extent_io_tree *tree,
return ret;
}
+static struct extent_state *
+alloc_extent_state_atomic(struct extent_state *prealloc)
+{
+ if (!prealloc)
+ prealloc = alloc_extent_state(GFP_ATOMIC);
+
+ return prealloc;
+}
+
/*
* clear some bits on a range in the tree. This may require splitting
* or inserting elements in the tree, so the gfp mask is used to
@@ -529,8 +540,8 @@ hit_next:
*/
if (state->start < start) {
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
BUG_ON(err == -EEXIST);
prealloc = NULL;
@@ -551,8 +562,8 @@ hit_next:
* on the first half
*/
if (state->start <= end && state->end > end) {
- if (!prealloc)
- prealloc = alloc_extent_state(GFP_ATOMIC);
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
if (wake)
@@ -690,6 +701,15 @@ static void cache_state(struct extent_state *state,
}
}
+static void uncache_state(struct extent_state **cached_ptr)
+{
+ if (cached_ptr && (*cached_ptr)) {
+ struct extent_state *state = *cached_ptr;
+ *cached_ptr = NULL;
+ free_extent_state(state);
+ }
+}
+
/*
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
@@ -716,8 +736,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
- if (!prealloc)
- return -ENOMEM;
+ BUG_ON(!prealloc);
}
spin_lock(&tree->lock);
@@ -734,6 +753,8 @@ again:
*/
node = tree_search(tree, start);
if (!node) {
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
@@ -762,20 +783,18 @@ hit_next:
if (err)
goto out;
+ next_node = rb_next(node);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- if (start < end && prealloc && !need_resched()) {
- next_node = rb_next(node);
- if (next_node) {
- state = rb_entry(next_node, struct extent_state,
- rb_node);
- if (state->start == start)
- goto hit_next;
- }
+ if (next_node && start < end && prealloc && !need_resched()) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
}
goto search_again;
}
@@ -802,6 +821,9 @@ hit_next:
err = -EEXIST;
goto out;
}
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, start);
BUG_ON(err == -EEXIST);
prealloc = NULL;
@@ -832,14 +854,25 @@ hit_next:
this_end = end;
else
this_end = last_start - 1;
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
+
+ /*
+ * Avoid freeing 'prealloc' if it can be merged with
+ * the later extent.
+ */
+ atomic_inc(&prealloc->refs);
err = insert_state(tree, prealloc, start, this_end,
&bits);
BUG_ON(err == -EEXIST);
if (err) {
+ free_extent_state(prealloc);
prealloc = NULL;
goto out;
}
cache_state(prealloc, cached_state);
+ free_extent_state(prealloc);
prealloc = NULL;
start = this_end + 1;
goto search_again;
@@ -856,6 +889,9 @@ hit_next:
err = -EEXIST;
goto out;
}
+
+ prealloc = alloc_extent_state_atomic(prealloc);
+ BUG_ON(!prealloc);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
@@ -932,18 +968,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
NULL, mask);
}
-static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
- NULL, mask);
-}
-
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+ struct extent_state **cached_state, gfp_t mask)
{
- return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- NULL, mask);
+ return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
+ NULL, cached_state, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -954,11 +983,6 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
cached_state, mask);
}
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
-{
- return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
-}
-
/*
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
@@ -1012,33 +1036,13 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
mask);
}
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
mask);
}
/*
- * helper function to set pages and extents in the tree dirty
- */
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
-{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = end >> PAGE_CACHE_SHIFT;
- struct page *page;
-
- while (index <= end_index) {
- page = find_get_page(tree->mapping, index);
- BUG_ON(!page);
- __set_page_dirty_nobuffers(page);
- page_cache_release(page);
- index++;
- }
- return 0;
-}
-
-/*
* helper function to set both pages and extents in the tree writeback
*/
static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1433,12 +1437,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned long bits)
+ unsigned long bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
u64 cur_start = *start;
u64 total_bytes = 0;
+ u64 last = 0;
int found = 0;
if (search_end <= cur_start) {
@@ -1463,7 +1468,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
state = rb_entry(node, struct extent_state, rb_node);
if (state->start > search_end)
break;
- if (state->end >= cur_start && (state->state & bits)) {
+ if (contig && found && state->start > last + 1)
+ break;
+ if (state->end >= cur_start && (state->state & bits) == bits) {
total_bytes += min(search_end, state->end) + 1 -
max(cur_start, state->start);
if (total_bytes >= max_bytes)
@@ -1472,6 +1479,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
*start = state->start;
found = 1;
}
+ last = state->end;
+ } else if (contig && found) {
+ break;
}
node = rb_next(node);
if (!node)
@@ -1729,6 +1739,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
do {
struct page *page = bvec->bv_page;
+ struct extent_state *cached = NULL;
+ struct extent_state *state;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1743,9 +1756,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
if (++bvec <= bvec_end)
prefetchw(&bvec->bv_page->flags);
+ spin_lock(&tree->lock);
+ state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
+ if (state && state->start == start) {
+ /*
+ * take a reference on the state, unlock will drop
+ * the ref
+ */
+ cache_state(state, &cached);
+ }
+ spin_unlock(&tree->lock);
+
if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
ret = tree->ops->readpage_end_io_hook(page, start, end,
- NULL);
+ state);
if (ret)
uptodate = 0;
}
@@ -1758,15 +1782,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
test_bit(BIO_UPTODATE, &bio->bi_flags);
if (err)
uptodate = 0;
+ uncache_state(&cached);
continue;
}
}
if (uptodate) {
- set_extent_uptodate(tree, start, end,
+ set_extent_uptodate(tree, start, end, &cached,
GFP_ATOMIC);
}
- unlock_extent(tree, start, end, GFP_ATOMIC);
+ unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
if (whole_page) {
if (uptodate) {
@@ -1790,44 +1815,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
bio_put(bio);
}
-/*
- * IO done from prepare_write is pretty simple, we just unlock
- * the structs in the extent tree when done, and set the uptodate bits
- * as appropriate.
- */
-static void end_bio_extent_preparewrite(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct extent_io_tree *tree;
- u64 start;
- u64 end;
-
- do {
- struct page *page = bvec->bv_page;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
- start = ((u64)page->index << PAGE_CACHE_SHIFT) +
- bvec->bv_offset;
- end = start + bvec->bv_len - 1;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (uptodate) {
- set_extent_uptodate(tree, start, end, GFP_ATOMIC);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
-
- unlock_extent(tree, start, end, GFP_ATOMIC);
-
- } while (bvec >= bio->bi_io_vec);
-
- bio_put(bio);
-}
-
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
gfp_t gfp_flags)
@@ -1946,6 +1933,7 @@ void set_page_extent_mapped(struct page *page)
static void set_page_extent_head(struct page *page, unsigned long len)
{
+ WARN_ON(!PagePrivate(page));
set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
}
@@ -1975,7 +1963,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
struct btrfs_ordered_extent *ordered;
int ret;
int nr = 0;
- size_t page_offset = 0;
+ size_t pg_offset = 0;
size_t iosize;
size_t disk_io_size;
size_t blocksize = inode->i_sb->s_blocksize;
@@ -1983,6 +1971,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_page_extent_mapped(page);
+ if (!PageUptodate(page)) {
+ if (cleancache_get_page(page) == 0) {
+ BUG_ON(blocksize != PAGE_SIZE);
+ goto out;
+ }
+ }
+
end = page_end;
while (1) {
lock_extent(tree, start, end, GFP_NOFS);
@@ -2009,19 +2004,22 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
while (cur <= end) {
if (cur >= last_byte) {
char *userpage;
- iosize = PAGE_CACHE_SIZE - page_offset;
+ struct extent_state *cached = NULL;
+
+ iosize = PAGE_CACHE_SIZE - pg_offset;
userpage = kmap_atomic(page, KM_USER0);
- memset(userpage + page_offset, 0, iosize);
+ memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
kunmap_atomic(userpage, KM_USER0);
set_extent_uptodate(tree, cur, cur + iosize - 1,
- GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
break;
}
- em = get_extent(inode, page, page_offset, cur,
+ em = get_extent(inode, page, pg_offset, cur,
end - cur + 1, 0);
- if (IS_ERR(em) || !em) {
+ if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end, GFP_NOFS);
break;
@@ -2056,16 +2054,19 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
char *userpage;
+ struct extent_state *cached = NULL;
+
userpage = kmap_atomic(page, KM_USER0);
- memset(userpage + page_offset, 0, iosize);
+ memset(userpage + pg_offset, 0, iosize);
flush_dcache_page(page);
kunmap_atomic(userpage, KM_USER0);
set_extent_uptodate(tree, cur, cur + iosize - 1,
- GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+ &cached, GFP_NOFS);
+ unlock_extent_cached(tree, cur, cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
/* the get_extent function already copied into the page */
@@ -2074,7 +2075,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
/* we have an inline extent but it didn't get marked up
@@ -2084,7 +2085,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
continue;
}
@@ -2097,7 +2098,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
pnr -= page->index;
ret = submit_extent_page(READ, tree, page,
- sector, disk_io_size, page_offset,
+ sector, disk_io_size, pg_offset,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
@@ -2108,8 +2109,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
if (ret)
SetPageError(page);
cur = cur + iosize;
- page_offset += iosize;
+ pg_offset += iosize;
}
+out:
if (!nr) {
if (!PageError(page))
SetPageUptodate(page);
@@ -2181,10 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unsigned long nr_written = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC_PLUG;
+ write_flags = WRITE_SYNC;
else
write_flags = WRITE;
+ trace___extent_writepage(page, inode, wbc);
+
WARN_ON(!PageLocked(page));
pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index ||
@@ -2301,7 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
em = epd->get_extent(inode, page, pg_offset, cur,
end - cur + 1, 1);
- if (IS_ERR(em) || !em) {
+ if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
break;
}
@@ -2641,7 +2645,7 @@ int extent_readpages(struct extent_io_tree *tree,
prefetchw(&page->flags);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL)) {
+ page->index, GFP_NOFS)) {
__extent_read_full_page(tree, page, get_extent,
&bio, 0, &bio_flags);
}
@@ -2680,125 +2684,6 @@ int extent_invalidatepage(struct extent_io_tree *tree,
}
/*
- * simple commit_write call, set_range_dirty is used to mark both
- * the pages and the extent records as dirty
- */
-int extent_commit_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to)
-{
- loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-
- set_page_extent_mapped(page);
- set_page_dirty(page);
-
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
- return 0;
-}
-
-int extent_prepare_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to, get_extent_t *get_extent)
-{
- u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
- u64 block_start;
- u64 orig_block_start;
- u64 block_end;
- u64 cur_end;
- struct extent_map *em;
- unsigned blocksize = 1 << inode->i_blkbits;
- size_t page_offset = 0;
- size_t block_off_start;
- size_t block_off_end;
- int err = 0;
- int iocount = 0;
- int ret = 0;
- int isnew;
-
- set_page_extent_mapped(page);
-
- block_start = (page_start + from) & ~((u64)blocksize - 1);
- block_end = (page_start + to - 1) | (blocksize - 1);
- orig_block_start = block_start;
-
- lock_extent(tree, page_start, page_end, GFP_NOFS);
- while (block_start <= block_end) {
- em = get_extent(inode, page, page_offset, block_start,
- block_end - block_start + 1, 1);
- if (IS_ERR(em) || !em)
- goto err;
-
- cur_end = min(block_end, extent_map_end(em) - 1);
- block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
- block_off_end = block_off_start + blocksize;
- isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
-
- if (!PageUptodate(page) && isnew &&
- (block_off_end > to || block_off_start < from)) {
- void *kaddr;
-
- kaddr = kmap_atomic(page, KM_USER0);
- if (block_off_end > to)
- memset(kaddr + to, 0, block_off_end - to);
- if (block_off_start < from)
- memset(kaddr + block_off_start, 0,
- from - block_off_start);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
- }
- if ((em->block_start != EXTENT_MAP_HOLE &&
- em->block_start != EXTENT_MAP_INLINE) &&
- !isnew && !PageUptodate(page) &&
- (block_off_end > to || block_off_start < from) &&
- !test_range_bit(tree, block_start, cur_end,
- EXTENT_UPTODATE, 1, NULL)) {
- u64 sector;
- u64 extent_offset = block_start - em->start;
- size_t iosize;
- sector = (em->block_start + extent_offset) >> 9;
- iosize = (cur_end - block_start + blocksize) &
- ~((u64)blocksize - 1);
- /*
- * we've already got the extent locked, but we
- * need to split the state such that our end_bio
- * handler can clear the lock.
- */
- set_extent_bit(tree, block_start,
- block_start + iosize - 1,
- EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
- ret = submit_extent_page(READ, tree, page,
- sector, iosize, page_offset, em->bdev,
- NULL, 1,
- end_bio_extent_preparewrite, 0,
- 0, 0);
- if (ret && !err)
- err = ret;
- iocount++;
- block_start = block_start + iosize;
- } else {
- set_extent_uptodate(tree, block_start, cur_end,
- GFP_NOFS);
- unlock_extent(tree, block_start, cur_end, GFP_NOFS);
- block_start = cur_end + 1;
- }
- page_offset = block_start & (PAGE_CACHE_SIZE - 1);
- free_extent_map(em);
- }
- if (iocount) {
- wait_extent_bit(tree, orig_block_start,
- block_end, EXTENT_LOCKED);
- }
- check_page_uptodate(tree, page);
-err:
- /* FIXME, zero out newly allocated blocks on error */
- return err;
-}
-
-/*
* a helper for releasepage, this tests for areas of the page that
* are locked or under IO and drops the related state bits if it is safe
* to drop the page.
@@ -2821,9 +2706,17 @@ int try_release_extent_state(struct extent_map_tree *map,
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
*/
- clear_extent_bit(tree, start, end,
+ ret = clear_extent_bit(tree, start, end,
~(EXTENT_LOCKED | EXTENT_NODATASUM),
0, 0, NULL, mask);
+
+ /* if clear_extent_bit failed for enomem reasons,
+ * we can't allow the release to continue.
+ */
+ if (ret < 0)
+ ret = 0;
+ else
+ ret = 1;
}
return ret;
}
@@ -2848,7 +2741,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
len = end - start + 1;
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
- if (!em || IS_ERR(em)) {
+ if (IS_ERR_OR_NULL(em)) {
write_unlock(&map->lock);
break;
}
@@ -2876,31 +2769,44 @@ int try_release_extent_mapping(struct extent_map_tree *map,
return try_release_extent_state(map, tree, page, mask);
}
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
- get_extent_t *get_extent)
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This maps until we find something past 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+ u64 offset,
+ u64 last,
+ get_extent_t *get_extent)
{
- struct inode *inode = mapping->host;
- struct extent_state *cached_state = NULL;
- u64 start = iblock << inode->i_blkbits;
- sector_t sector = 0;
- size_t blksize = (1 << inode->i_blkbits);
+ u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
struct extent_map *em;
+ u64 len;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
- 0, &cached_state, GFP_NOFS);
- em = get_extent(inode, NULL, 0, start, blksize, 0);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
- start + blksize - 1, &cached_state, GFP_NOFS);
- if (!em || IS_ERR(em))
- return 0;
+ if (offset >= last)
+ return NULL;
- if (em->block_start > EXTENT_MAP_LAST_BYTE)
- goto out;
+ while(1) {
+ len = last - offset;
+ if (len == 0)
+ break;
+ len = (len + sectorsize - 1) & ~(sectorsize - 1);
+ em = get_extent(inode, NULL, 0, offset, len, 0);
+ if (IS_ERR_OR_NULL(em))
+ return em;
- sector = (em->block_start + start - em->start) >> inode->i_blkbits;
-out:
- free_extent_map(em);
- return sector;
+ /* if this isn't a hole return it */
+ if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+ em->block_start != EXTENT_MAP_HOLE) {
+ return em;
+ }
+
+ /* this is a hole, advance to the next extent */
+ offset = extent_map_end(em);
+ free_extent_map(em);
+ if (offset >= last)
+ break;
+ }
+ return NULL;
}
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -2912,16 +2818,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u32 flags = 0;
u32 found_type;
u64 last;
+ u64 last_for_get_extent = 0;
u64 disko = 0;
+ u64 isize = i_size_read(inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_file_extent_item *item;
int end = 0;
- u64 em_start = 0, em_len = 0;
+ u64 em_start = 0;
+ u64 em_len = 0;
+ u64 em_end = 0;
unsigned long emflags;
- int hole = 0;
if (len == 0)
return -EINVAL;
@@ -2931,8 +2840,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -ENOMEM;
path->leave_spinning = 1;
+ /*
+ * lookup the last file extent. We're not using i_size here
+ * because there might be preallocation past i_size
+ */
ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
- path, inode->i_ino, -1, 0);
+ path, btrfs_ino(inode), -1, 0);
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -2944,18 +2857,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
- /* No extents, just return */
- if (found_key.objectid != inode->i_ino ||
+ /* No extents, but there might be delalloc bits */
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
- btrfs_free_path(path);
- return 0;
+ /* have to trust i_size as the end */
+ last = (u64)-1;
+ last_for_get_extent = isize;
+ } else {
+ /*
+ * remember the start of the last extent. There are a
+ * bunch of different factors that go into the length of the
+ * extent, so its much less complex to remember where it started
+ */
+ last = found_key.offset;
+ last_for_get_extent = last + 1;
}
- last = found_key.offset;
btrfs_free_path(path);
+ /*
+ * we might have some extents allocated but more delalloc past those
+ * extents. so, we trust isize unless the start of the last extent is
+ * beyond isize
+ */
+ if (last < isize) {
+ last = (u64)-1;
+ last_for_get_extent = isize;
+ }
+
lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
&cached_state, GFP_NOFS);
- em = get_extent(inode, NULL, 0, off, max - off, 0);
+
+ em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ get_extent);
if (!em)
goto out;
if (IS_ERR(em)) {
@@ -2964,22 +2897,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
while (!end) {
- hole = 0;
- off = em->start + em->len;
- if (off >= max)
- end = 1;
+ u64 offset_in_extent;
- if (em->block_start == EXTENT_MAP_HOLE) {
- hole = 1;
- goto next;
- }
+ /* break if the extent we found is outside the range */
+ if (em->start >= max || extent_map_end(em) < off)
+ break;
- em_start = em->start;
- em_len = em->len;
+ /*
+ * get_extent may return an extent that starts before our
+ * requested range. We have to make sure the ranges
+ * we return to fiemap always move forward and don't
+ * overlap, so adjust the offsets here
+ */
+ em_start = max(em->start, off);
+ /*
+ * record the offset from the start of the extent
+ * for adjusting the disk offset below
+ */
+ offset_in_extent = em_start - em->start;
+ em_end = extent_map_end(em);
+ em_len = em_end - em_start;
+ emflags = em->flags;
disko = 0;
flags = 0;
+ /*
+ * bump off for our next call to get_extent
+ */
+ off = extent_map_end(em);
+ if (off >= max)
+ end = 1;
+
if (em->block_start == EXTENT_MAP_LAST_BYTE) {
end = 1;
flags |= FIEMAP_EXTENT_LAST;
@@ -2990,42 +2939,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
} else {
- disko = em->block_start;
+ disko = em->block_start + offset_in_extent;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
-next:
- emflags = em->flags;
free_extent_map(em);
em = NULL;
- if (!end) {
- em = get_extent(inode, NULL, 0, off, max - off, 0);
- if (!em)
- goto out;
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- emflags = em->flags;
- }
-
- if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+ if ((em_start >= last) || em_len == (u64)-1 ||
+ (last == (u64)-1 && isize <= em_end)) {
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
- if (em_start == last) {
+ /* now scan forward to see if this is really the last extent. */
+ em = get_extent_skip_holes(inode, off, last_for_get_extent,
+ get_extent);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out;
+ }
+ if (!em) {
flags |= FIEMAP_EXTENT_LAST;
end = 1;
}
-
- if (!hole) {
- ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
- em_len, flags);
- if (ret)
- goto out_free;
- }
+ ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+ em_len, flags);
+ if (ret)
+ goto out_free;
}
out_free:
free_extent_map(em);
@@ -3140,8 +3081,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len,
- struct page *page0,
- gfp_t mask)
+ struct page *page0)
{
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
@@ -3162,7 +3102,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
}
rcu_read_unlock();
- eb = __alloc_extent_buffer(tree, start, len, mask);
+ eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
if (!eb)
return NULL;
@@ -3179,7 +3119,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
i = 0;
}
for (; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
+ p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
if (!p) {
WARN_ON(1);
goto free_eb;
@@ -3194,7 +3134,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
}
if (!PageUptodate(p))
uptodate = 0;
- unlock_page(p);
+
+ /*
+ * see below about how we avoid a nasty race with release page
+ * and why we unlock later
+ */
+ if (i != 0)
+ unlock_page(p);
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3218,9 +3164,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
atomic_inc(&eb->refs);
spin_unlock(&tree->buffer_lock);
radix_tree_preload_end();
+
+ /*
+ * there is a race where release page may have
+ * tried to find this extent buffer in the radix
+ * but failed. It will tell the VM it is safe to
+ * reclaim the page, and it will clear the page private bit.
+ * We must make sure to set the page private bit properly
+ * after the extent buffer is in the radix tree so
+ * it doesn't get lost
+ */
+ set_page_extent_mapped(eb->first_page);
+ set_page_extent_head(eb->first_page, eb->len);
+ if (!page0)
+ unlock_page(eb->first_page);
return eb;
free_eb:
+ if (eb->first_page && !page0)
+ unlock_page(eb->first_page);
+
if (!atomic_dec_and_test(&eb->refs))
return exists;
btrfs_release_extent_buffer(eb);
@@ -3228,8 +3191,7 @@ free_eb:
}
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- gfp_t mask)
+ u64 start, unsigned long len)
{
struct extent_buffer *eb;
@@ -3271,10 +3233,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
continue;
lock_page(page);
+ WARN_ON(!PagePrivate(page));
+
+ set_page_extent_mapped(page);
if (i == 0)
set_page_extent_head(page, eb->len);
- else
- set_page_private(page, EXTENT_PAGE_PRIVATE);
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
@@ -3289,13 +3252,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
return 0;
}
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
- struct extent_buffer *eb)
-{
- return wait_on_extent_writeback(tree, eb->start,
- eb->start + eb->len - 1);
-}
-
int set_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb)
{
@@ -3341,7 +3297,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
num_pages = num_extent_pages(eb->start, eb->len);
set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
- GFP_NOFS);
+ NULL, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3464,6 +3420,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
+
+ WARN_ON(!PagePrivate(page));
+
+ set_page_extent_mapped(page);
+ if (i == 0)
+ set_page_extent_head(page, eb->len);
+
if (inc_all_pages)
page_cache_get(page);
if (!PageUptodate(page)) {
@@ -3569,6 +3532,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
"wanted %lu %lu\n", (unsigned long long)eb->start,
eb->len, start, min_len);
WARN_ON(1);
+ return -EINVAL;
}
p = extent_buffer_page(eb, i);
@@ -3761,6 +3725,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
kunmap_atomic(dst_kaddr, KM_USER0);
}
+static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
+{
+ unsigned long distance = (src > dst) ? src - dst : dst - src;
+ return distance < len;
+}
+
static void copy_pages(struct page *dst_page, struct page *src_page,
unsigned long dst_off, unsigned long src_off,
unsigned long len)
@@ -3768,10 +3738,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
char *src_kaddr;
- if (dst_page != src_page)
+ if (dst_page != src_page) {
src_kaddr = kmap_atomic(src_page, KM_USER1);
- else
+ } else {
src_kaddr = dst_kaddr;
+ BUG_ON(areas_overlap(src_off, dst_off, len));
+ }
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3846,7 +3818,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
- if (dst_offset < src_offset) {
+ if (!areas_overlap(src_offset, dst_offset, len)) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..4e8445a4757c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
#define EXTENT_BUFFER_UPTODATE 0
#define EXTENT_BUFFER_BLOCKING 1
#define EXTENT_BUFFER_DIRTY 2
+#define EXTENT_BUFFER_CORRUPT 3
/* these are flags for extent_clear_unlock_delalloc */
#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -152,23 +153,14 @@ static inline int extent_compress_type(unsigned long bio_flags)
struct extent_map_tree;
-static inline struct extent_state *extent_state_next(struct extent_state *state)
-{
- struct rb_node *node;
- node = rb_next(&state->rb_node);
- if (!node)
- return NULL;
- return rb_entry(node, struct extent_state, rb_node);
-}
-
typedef struct extent_map *(get_extent_t)(struct inode *inode,
struct page *page,
- size_t page_offset,
+ size_t pg_offset,
u64 start, u64 len,
int create);
void extent_io_tree_init(struct extent_io_tree *tree,
- struct address_space *mapping, gfp_t mask);
+ struct address_space *mapping);
int try_release_extent_mapping(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
@@ -191,7 +183,7 @@ void extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned long bits);
+ u64 max_bytes, unsigned long bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -207,21 +199,15 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int exclusive_bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
-int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, int bits);
struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -242,28 +228,17 @@ int extent_readpages(struct extent_io_tree *tree,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages,
get_extent_t get_extent);
-int extent_prepare_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to, get_extent_t *get_extent);
-int extent_commit_write(struct extent_io_tree *tree,
- struct inode *inode, struct page *page,
- unsigned from, unsigned to);
-sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
- get_extent_t *get_extent);
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent);
-int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len,
- struct page *page0,
- gfp_t mask);
+ struct page *page0);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len,
- gfp_t mask);
+ u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb);
int read_extent_buffer_pages(struct extent_io_tree *tree,
struct extent_buffer *eb, u64 start, int wait,
@@ -291,16 +266,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_offset, unsigned long len);
void memset_extent_buffer(struct extent_buffer *eb, char c,
unsigned long start, unsigned long len);
-int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
- struct extent_buffer *eb);
-int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
-int test_extent_buffer_dirty(struct extent_io_tree *tree,
- struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -318,7 +288,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long *map_start,
unsigned long *map_len, int km);
void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
-int release_extent_buffer_tail_pages(struct extent_buffer *eb);
int extent_range_uptodate(struct extent_io_tree *tree,
u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..2d0410344ea3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -28,12 +28,11 @@ void extent_map_exit(void)
/**
* extent_map_tree_init - initialize extent map tree
* @tree: tree to initialize
- * @mask: flags for memory allocations during tree operations
*
* Initialize the extent tree @tree. Should be called for each new inode
* or other user of the extent_map interface.
*/
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
+void extent_map_tree_init(struct extent_map_tree *tree)
{
tree->map = RB_ROOT;
rwlock_init(&tree->lock);
@@ -41,18 +40,17 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
/**
* alloc_extent_map - allocate new extent map structure
- * @mask: memory allocation flags
*
* Allocate a new extent_map structure. The new structure is
* returned with a reference count of one and needs to be
* freed using free_extent_map()
*/
-struct extent_map *alloc_extent_map(gfp_t mask)
+struct extent_map *alloc_extent_map(void)
{
struct extent_map *em;
- em = kmem_cache_alloc(extent_map_cache, mask);
- if (!em || IS_ERR(em))
- return em;
+ em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+ if (!em)
+ return NULL;
em->in_tree = 0;
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
@@ -243,7 +241,7 @@ out:
* Insert @em into @tree or perform a simple forward/backward merge with
* existing mappings. The extent_map struct passed in will be inserted
* into the tree directly, with an additional reference taken, or a
- * reference dropped if the merge attempt was successfull.
+ * reference dropped if the merge attempt was successful.
*/
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em)
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 28b44dbd1e35..33a7890b1f40 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,14 +49,14 @@ static inline u64 extent_map_block_end(struct extent_map *em)
return em->block_start + em->block_len;
}
-void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask);
+void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
-struct extent_map *alloc_extent_map(gfp_t mask);
+struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f19a3e1bf32..90d4ee52cd45 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
file_key.objectid = objectid;
file_key.offset = pos;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
if (bio->bi_size > PAGE_CACHE_SIZE * 8)
path->reada = 2;
@@ -190,7 +193,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
u32 item_size;
if (item)
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
path, disk_bytenr, 0);
if (IS_ERR(item)) {
@@ -205,12 +208,13 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
EXTENT_NODATASUM, GFP_NOFS);
} else {
printk(KERN_INFO "btrfs no csum found "
- "for inode %lu start %llu\n",
- inode->i_ino,
+ "for inode %llu start %llu\n",
+ (unsigned long long)
+ btrfs_ino(inode),
(unsigned long long)offset);
}
item = NULL;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto found;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
@@ -263,7 +267,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
- struct list_head *list)
+ struct list_head *list, int search_commit)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -280,6 +284,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
path = btrfs_alloc_path();
BUG_ON(!path);
+ if (search_commit) {
+ path->skip_locking = 1;
+ path->reada = 2;
+ path->search_commit_root = 1;
+ }
+
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -492,7 +502,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
u32 new_size = (bytenr - key->offset) >> blocksize_bits;
new_size *= csum_size;
ret = btrfs_truncate_item(trans, root, path, new_size, 1);
- BUG_ON(ret);
} else if (key->offset >= bytenr && csum_end > end_byte &&
end_byte > key->offset) {
/*
@@ -505,7 +514,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
new_size *= csum_size;
ret = btrfs_truncate_item(trans, root, path, new_size, 0);
- BUG_ON(ret);
key->offset = end_byte;
ret = btrfs_set_item_key_safe(trans, root, path, key);
@@ -548,10 +556,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
- goto out;
+ break;
path->slots[0]--;
} else if (ret < 0) {
- goto out;
+ break;
}
leaf = path->nodes[0];
@@ -576,7 +584,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
/* delete the entire item, it is inside our range */
if (key.offset >= bytenr && csum_end <= end_byte) {
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
+ if (ret)
+ goto out;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -628,11 +637,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
if (key.offset < bytenr)
break;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
+ ret = 0;
out:
btrfs_free_path(path);
- return 0;
+ return ret;
}
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
@@ -719,7 +729,7 @@ again:
* at this point, we know the tree has an item, but it isn't big
* enough yet to put our csum in. Grow it
*/
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &file_key, path,
csum_size, 1);
if (ret < 0)
@@ -758,12 +768,11 @@ again:
goto insert;
ret = btrfs_extend_item(trans, root, path, diff);
- BUG_ON(ret);
goto csum;
}
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
csum_offset = 0;
if (found_next) {
u64 tmp = total_bytes + root->sectorsize;
@@ -847,7 +856,7 @@ next_sector:
}
btrfs_mark_buffer_dirty(path->nodes[0]);
if (total_bytes < sums->len) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
cond_resched();
goto again;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c1d3a818731a..c6a22d783c35 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -40,19 +40,276 @@
#include "locking.h"
#include "compat.h"
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ *
+ * One record describes one queued inode.  Records are linked into
+ * the fs_info->defrag_inodes rbtree, ordered by inode number.
+ */
+struct inode_defrag {
+ struct rb_node rb_node;
+ /* inode objectid; this is the key the rbtree is ordered by */
+ u64 ino;
+ /*
+ * transid where the defrag was added, we search for
+ * extents newer than this
+ */
+ u64 transid;
+
+ /* objectid of the root the inode belongs to (root->root_key.objectid) */
+ u64 root;
+
+ /*
+ * last offset we were able to defrag; the next defrag pass
+ * over this inode starts from here
+ */
+ u64 last_offset;
+
+ /*
+ * if we've wrapped around back to zero once already; used so
+ * a pass that started mid-file restarts from offset zero only once
+ */
+ int cycled;
+};
+
+/*
+ * Insert a defrag record for an inode into the defrag tree.  The
+ * defrag_inodes_lock must already be held by the caller.
+ *
+ * If a record with the same inode number is already queued, that
+ * record is kept and updated instead: its transid is lowered when the
+ * new record carries an older one, its last_offset is raised when the
+ * new record carries a larger one, and the record passed in is freed.
+ *
+ * Always returns 0.
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct rb_node **link = &root->fs_info->defrag_inodes.rb_node;
+	struct rb_node *parent = NULL;
+	struct inode_defrag *cur;
+
+	while (*link) {
+		parent = *link;
+		cur = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (defrag->ino < cur->ino) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (defrag->ino > cur->ino) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/*
+		 * Same inode number is already queued.  Fold the new
+		 * request into the existing record and drop ours.
+		 */
+		if (cur->transid > defrag->transid)
+			cur->transid = defrag->transid;
+		if (cur->last_offset < defrag->last_offset)
+			cur->last_offset = defrag->last_offset;
+		kfree(defrag);
+		return 0;
+	}
+
+	/* not queued yet: link the new record in at the leaf we reached */
+	BTRFS_I(inode)->in_defrag = 1;
+	rb_link_node(&defrag->rb_node, parent, link);
+	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+	return 0;
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ *
+ * Nothing is queued when the AUTO_DEFRAG mount option is off, when the
+ * filesystem is closing, or when the inode is already marked in_defrag.
+ *
+ * @trans may be NULL; in that case the root's last_trans is recorded as
+ * the transid of the request instead of the handle's transid.
+ *
+ * Returns 0 on success (including the nothing-to-do cases) and -ENOMEM
+ * if the record could not be allocated.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *defrag;
+	int ret = 0;
+	u64 transid;
+
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (root->fs_info->closing)
+		return 0;
+
+	/* unlocked fast path; re-checked under the lock below */
+	if (BTRFS_I(inode)->in_defrag)
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = root->last_trans;
+
+	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	/*
+	 * btrfs_run_defrag_inodes() uses this value as the key objectid
+	 * when it looks the inode up again, so store the btrfs objectid
+	 * the same way the rest of this file does.
+	 */
+	defrag->ino = btrfs_ino(inode);
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	if (!BTRFS_I(inode)->in_defrag)
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+	else
+		/* someone queued this inode while we were allocating */
+		kfree(defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	return ret;
+}
+
+/*
+ * Look up the defrag record for inode number @ino.  Must be called
+ * with the defrag_inodes lock held.
+ *
+ * Returns the matching record, or NULL when there is none.  On a miss,
+ * if @next is not NULL, *next is set to the rb_node of the first queued
+ * record with a larger inode number, or to NULL when no such record
+ * exists.  *next is left untouched on a hit.
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+					      struct rb_node **next)
+{
+	struct rb_node *node = info->defrag_inodes.rb_node;
+	struct rb_node *last = NULL;
+	struct inode_defrag *entry = NULL;
+
+	while (node) {
+		last = node;
+		entry = rb_entry(node, struct inode_defrag, rb_node);
+
+		if (ino == entry->ino)
+			return entry;
+
+		node = (ino < entry->ino) ? node->rb_left : node->rb_right;
+	}
+
+	if (!next)
+		return NULL;
+
+	/*
+	 * The descent stopped next to where @ino would sit.  If the last
+	 * node visited sorts before @ino, step forward until we reach one
+	 * that sorts after it (or run off the end of the tree).
+	 */
+	while (last && ino > entry->ino) {
+		last = rb_next(last);
+		entry = rb_entry(last, struct inode_defrag, rb_node);
+	}
+	*next = last;
+	return NULL;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ *
+ * Records are taken out of fs_info->defrag_inodes one at a time in
+ * increasing inode number order, wrapping around to the lowest number
+ * once the end of the tree is reached, until the tree is empty.
+ * defrag_inodes_lock is dropped while an inode is looked up and
+ * defragged and retaken before the next search.  If the filesystem is
+ * closing, records are removed and freed without defragging anything.
+ *
+ * fs_info->defrag_running is raised for the duration of the call.
+ * Always returns 0.
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+ struct inode_defrag *defrag;
+ struct btrfs_root *inode_root;
+ struct inode *inode;
+ struct rb_node *n;
+ struct btrfs_key key;
+ struct btrfs_ioctl_defrag_range_args range;
+ u64 first_ino = 0;
+ int num_defrag;
+ int defrag_batch = 1024;
+
+ memset(&range, 0, sizeof(range));
+ range.len = (u64)-1;
+
+ atomic_inc(&fs_info->defrag_running);
+ spin_lock(&fs_info->defrag_inodes_lock);
+ while(1) {
+ n = NULL;
+
+ /*
+ * find an inode to defrag: the record for first_ino, or
+ * failing that the next higher one.  If there is nothing
+ * at or above first_ino, restart the search from zero; a
+ * search from zero that finds nothing means the tree is
+ * empty and we are done.
+ */
+ defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+ if (!defrag) {
+ if (n)
+ defrag = rb_entry(n, struct inode_defrag, rb_node);
+ else if (first_ino) {
+ first_ino = 0;
+ continue;
+ } else {
+ break;
+ }
+ }
+
+ /*
+ * remove it from the rbtree, and remember where the next
+ * search should resume
+ */
+ first_ino = defrag->ino + 1;
+ rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+
+ if (fs_info->closing)
+ goto next_free; /* still holding defrag_inodes_lock */
+
+ spin_unlock(&fs_info->defrag_inodes_lock);
+
+ /*
+ * get the inode: first the root recorded in the defrag
+ * record, then the inode inside that root.  If either
+ * lookup fails the record is simply dropped.
+ */
+ key.objectid = defrag->root;
+ btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(inode_root))
+ goto next;
+
+ key.objectid = defrag->ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+ if (IS_ERR(inode))
+ goto next;
+
+ /*
+ * do a chunk of defrag, starting at the offset where the
+ * previous pass over this inode left off.  in_defrag is
+ * cleared first since the record is no longer in the tree.
+ */
+ BTRFS_I(inode)->in_defrag = 0;
+ range.start = defrag->last_offset;
+ num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ defrag_batch);
+ /*
+ * if we filled the whole defrag batch, there
+ * must be more work to do. Queue this defrag
+ * again
+ */
+ if (num_defrag == defrag_batch) {
+ defrag->last_offset = range.start;
+ __btrfs_add_inode_defrag(inode, defrag);
+ /*
+ * we don't want to kfree defrag, we added it back to
+ * the rbtree
+ */
+ defrag = NULL;
+ } else if (defrag->last_offset && !defrag->cycled) {
+ /*
+ * we didn't fill our defrag batch, but
+ * we didn't start at zero. Make sure we loop
+ * around to the start of the file.
+ */
+ defrag->last_offset = 0;
+ defrag->cycled = 1;
+ __btrfs_add_inode_defrag(inode, defrag);
+ defrag = NULL;
+ }
+
+ iput(inode);
+next:
+ spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+ kfree(defrag);
+ }
+ spin_unlock(&fs_info->defrag_inodes_lock);
+
+ atomic_dec(&fs_info->defrag_running);
+
+ /*
+ * during unmount, we use the transaction_wait queue to
+ * wait for the defragger to stop
+ */
+ wake_up(&fs_info->transaction_wait);
+ return 0;
+}
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
- int write_bytes,
+ size_t write_bytes,
struct page **prepared_pages,
struct iov_iter *i)
{
size_t copied = 0;
+ size_t total_copied = 0;
int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- int total_copied = 0;
while (write_bytes > 0) {
size_t count = min_t(size_t,
@@ -70,14 +327,26 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
/* Flush processor's dcache for this page */
flush_dcache_page(page);
+
+ /*
+ * if we get a partial write, we can end up with
+ * partially up to date pages. These add
+ * a lot of complexity, so make sure they don't
+ * happen by forcing this copy to be retried.
+ *
+ * The rest of the btrfs_file_write code will fall
+ * back to page at a time copies after we return 0.
+ */
+ if (!PageUptodate(page) && copied < count)
+ copied = 0;
+
iov_iter_advance(i, copied);
write_bytes -= copied;
total_copied += copied;
/* Return to btrfs_file_aio_write to fault page */
- if (unlikely(copied == 0)) {
+ if (unlikely(copied == 0))
break;
- }
if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
offset += copied;
@@ -92,12 +361,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
/*
* unlocks pages after btrfs_file_write is done with them
*/
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
size_t i;
for (i = 0; i < num_pages; i++) {
- if (!pages[i])
- break;
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
* clear it here
@@ -117,17 +384,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
* this also makes the decision about creating an inline extent vs
* doing real data extents, marking pages dirty and delalloc as required.
*/
-static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct file *file,
- struct page **pages,
- size_t num_pages,
- loff_t pos,
- size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+ struct page **pages, size_t num_pages,
+ loff_t pos, size_t write_bytes,
+ struct extent_state **cached)
{
int err = 0;
int i;
- struct inode *inode = fdentry(file)->d_inode;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -140,8 +403,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
- NULL);
- BUG_ON(err);
+ cached);
+ if (err)
+ return err;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -149,13 +413,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
ClearPageChecked(p);
set_page_dirty(p);
}
- if (end_pos > isize) {
+
+ /*
+ * we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
+ if (end_pos > isize)
i_size_write(inode, end_pos);
- /* we've only changed i_size in ram, and we haven't updated
- * the disk i_size. There is no need to log the inode
- * at this time.
- */
- }
return 0;
}
@@ -183,9 +448,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
}
while (1) {
if (!split)
- split = alloc_extent_map(GFP_NOFS);
+ split = alloc_extent_map();
if (!split2)
- split2 = alloc_extent_map(GFP_NOFS);
+ split2 = alloc_extent_map();
+ BUG_ON(!split || !split2);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
@@ -289,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key new_key;
+ u64 ino = btrfs_ino(inode);
u64 search_start = start;
u64 disk_bytenr = 0;
u64 num_bytes = 0;
@@ -309,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
while (1) {
recow = 0;
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
search_start, -1);
if (ret < 0)
break;
if (ret > 0 && path->slots[0] > 0 && search_start == start) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
- if (key.objectid == inode->i_ino &&
+ if (key.objectid == ino &&
key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
@@ -337,7 +604,7 @@ next_slot:
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > inode->i_ino ||
+ if (key.objectid > ino ||
key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
@@ -367,7 +634,7 @@ next_slot:
search_start = max(key.offset, start);
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
@@ -384,7 +651,7 @@ next_slot:
ret = btrfs_duplicate_item(trans, root, path,
&new_key);
if (ret == -EAGAIN) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
if (ret < 0)
@@ -507,7 +774,7 @@ next_slot:
del_nr = 0;
del_slot = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
@@ -583,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
int del_slot = 0;
int recow;
int ret;
+ u64 ino = btrfs_ino(inode);
btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -591,18 +859,19 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
again:
recow = 0;
split = start;
- key.objectid = inode->i_ino;
+ key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = split;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto out;
if (ret > 0 && path->slots[0] > 0)
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- BUG_ON(key.objectid != inode->i_ino ||
- key.type != BTRFS_EXTENT_DATA_KEY);
+ BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -619,7 +888,7 @@ again:
other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
new_key.offset = end;
btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -642,7 +911,7 @@ again:
other_start = end;
other_end = 0;
if (extent_mergeable(leaf, path->slots[0] + 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -670,7 +939,7 @@ again:
new_key.offset = split;
ret = btrfs_duplicate_item(trans, root, path, &new_key);
if (ret == -EAGAIN) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
BUG_ON(ret < 0);
@@ -691,7 +960,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- inode->i_ino, orig_offset);
+ ino, orig_offset);
BUG_ON(ret);
if (split == start) {
@@ -707,10 +976,10 @@ again:
other_start = end;
other_end = 0;
if (extent_mergeable(leaf, path->slots[0] + 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
extent_end = other_end;
@@ -718,16 +987,16 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- inode->i_ino, orig_offset);
+ ino, orig_offset);
BUG_ON(ret);
}
other_start = 0;
other_end = start;
if (extent_mergeable(leaf, path->slots[0] - 1,
- inode->i_ino, bytenr, orig_offset,
+ ino, bytenr, orig_offset,
&other_start, &other_end)) {
if (recow) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
key.offset = other_start;
@@ -735,7 +1004,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- inode->i_ino, orig_offset);
+ ino, orig_offset);
BUG_ON(ret);
}
if (del_nr == 0) {
@@ -762,6 +1031,27 @@ out:
}
/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+	int err;
+
+	/*
+	 * Nothing to read when @pos sits on a page boundary or the page
+	 * already holds valid data.
+	 */
+	if (!(pos & (PAGE_CACHE_SIZE - 1)) || PageUptodate(page))
+		return 0;
+
+	err = btrfs_readpage(NULL, page);
+	if (err)
+		return err;
+
+	/* take the page lock again, then make sure the read succeeded */
+	lock_page(page);
+	if (PageUptodate(page))
+		return 0;
+
+	unlock_page(page);
+	return -EIO;
+}
+
+/*
* this gets pages into the page cache and locks them down, it also properly
* waits for data=ordered extents to finish before allowing the pages to be
* modified.
@@ -776,6 +1066,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
unsigned long index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = fdentry(file)->d_inode;
int err = 0;
+ int faili = 0;
u64 start_pos;
u64 last_pos;
@@ -783,25 +1074,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
if (start_pos > inode->i_size) {
- err = btrfs_cont_expand(inode, start_pos);
+ err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
if (err)
return err;
}
- memset(pages, 0, num_pages * sizeof(struct page *));
again:
for (i = 0; i < num_pages; i++) {
pages[i] = grab_cache_page(inode->i_mapping, index + i);
if (!pages[i]) {
- int c;
- for (c = i - 1; c >= 0; c--) {
- unlock_page(pages[c]);
- page_cache_release(pages[c]);
- }
- return -ENOMEM;
+ faili = i - 1;
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ if (i == 0)
+ err = prepare_uptodate_page(pages[i], pos);
+ if (i == num_pages - 1)
+ err = prepare_uptodate_page(pages[i],
+ pos + write_bytes);
+ if (err) {
+ page_cache_release(pages[i]);
+ faili = i - 1;
+ goto fail;
}
wait_on_page_writeback(pages[i]);
}
+ err = 0;
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,191 +1140,103 @@ again:
WARN_ON(!PageLocked(pages[i]));
}
return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ page_cache_release(pages[faili]);
+ faili--;
+ }
+ return err;
+
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+ struct iov_iter *i,
+ loff_t pos)
{
- struct file *file = iocb->ki_filp;
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page *pinned[2];
struct page **pages = NULL;
- struct iov_iter i;
- loff_t *ppos = &iocb->ki_pos;
- loff_t start_pos;
- ssize_t num_written = 0;
- ssize_t err = 0;
- size_t count;
- size_t ocount;
- int ret = 0;
- int nrptrs;
unsigned long first_index;
unsigned long last_index;
- int will_write;
- int buffered = 0;
- int copied = 0;
- int dirty_pages = 0;
-
- will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
- (file->f_flags & O_DIRECT));
-
- pinned[0] = NULL;
- pinned[1] = NULL;
-
- start_pos = pos;
-
- vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
- mutex_lock(&inode->i_mutex);
-
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- goto out;
- count = ocount;
-
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
-
- if (count == 0)
- goto out;
-
- err = file_remove_suid(file);
- if (err)
- goto out;
-
- /*
- * If BTRFS flips readonly due to some impossible error
- * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
- * although we have opened a file as writable, we have
- * to stop this write operation to ensure FS consistency.
- */
- if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
- err = -EROFS;
- goto out;
- }
-
- file_update_time(file);
- BTRFS_I(inode)->sequence++;
-
- if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = generic_file_direct_write(iocb, iov, &nr_segs,
- pos, ppos, count,
- ocount);
- /*
- * the generic O_DIRECT will update in-memory i_size after the
- * DIOs are done. But our endio handlers that update the on
- * disk i_size never update past the in memory i_size. So we
- * need one more update here to catch any additions to the
- * file
- */
- if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- mark_inode_dirty(inode);
- }
-
- if (num_written < 0) {
- ret = num_written;
- num_written = 0;
- goto out;
- } else if (num_written == count) {
- /* pick up pos changes done by the generic code */
- pos = *ppos;
- goto out;
- }
- /*
- * We are going to do buffered for the rest of the range, so we
- * need to make sure to invalidate the buffered pages when we're
- * done.
- */
- buffered = 1;
- pos += num_written;
- }
+ size_t num_written = 0;
+ int nrptrs;
+ int ret = 0;
- iov_iter_init(&i, iov, nr_segs, count, num_written);
- nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
(sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* generic_write_checks can change our pos */
- start_pos = pos;
+ if (!pages)
+ return -ENOMEM;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
- /*
- * there are lots of better ways to do this, but this code
- * makes sure the first and last page in the file range are
- * up to date and ready for cow
- */
- if ((pos & (PAGE_CACHE_SIZE - 1))) {
- pinned[0] = grab_cache_page(inode->i_mapping, first_index);
- if (!PageUptodate(pinned[0])) {
- ret = btrfs_readpage(NULL, pinned[0]);
- BUG_ON(ret);
- wait_on_page_locked(pinned[0]);
- } else {
- unlock_page(pinned[0]);
- }
- }
- if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
- pinned[1] = grab_cache_page(inode->i_mapping, last_index);
- if (!PageUptodate(pinned[1])) {
- ret = btrfs_readpage(NULL, pinned[1]);
- BUG_ON(ret);
- wait_on_page_locked(pinned[1]);
- } else {
- unlock_page(pinned[1]);
- }
- }
-
- while (iov_iter_count(&i) > 0) {
+ while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(iov_iter_count(&i),
+ size_t write_bytes = min(iov_iter_count(i),
nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + offset +
PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ size_t dirty_pages;
+ size_t copied;
WARN_ON(num_pages > nrptrs);
- memset(pages, 0, sizeof(struct page *) * nrptrs);
/*
* Fault pages before locking them in prepare_pages
* to avoid recursive lock
*/
- if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
+ if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
ret = -EFAULT;
- goto out;
+ break;
}
ret = btrfs_delalloc_reserve_space(inode,
num_pages << PAGE_CACHE_SHIFT);
if (ret)
- goto out;
+ break;
+ /*
+ * This is going to set up the pages array with the number of
+ * pages we want, so we don't really need to worry about the
+ * contents of pages from loop to loop
+ */
ret = prepare_pages(root, file, pages, num_pages,
pos, first_index, last_index,
write_bytes);
if (ret) {
btrfs_delalloc_release_space(inode,
num_pages << PAGE_CACHE_SHIFT);
- goto out;
+ break;
}
copied = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, &i);
- dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
+ write_bytes, pages, i);
+ /*
+ * if we have trouble faulting in the pages, fall
+ * back to one page at a time
+ */
+ if (copied < write_bytes)
+ nrptrs = 1;
+
+ if (copied == 0)
+ dirty_pages = 0;
+ else
+ dirty_pages = (copied + offset +
+ PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ /*
+ * If we had a short copy we need to release the excess delalloc
+ * bytes we reserved. We need to increment outstanding_extents
+ * because btrfs_delalloc_release_space will decrement it, but
+ * we still have an outstanding extent for the chunk we actually
+ * managed to copy.
+ */
if (num_pages > dirty_pages) {
if (copied > 0)
atomic_inc(
@@ -1036,43 +1247,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
}
if (copied > 0) {
- dirty_and_release_pages(NULL, root, file, pages,
- dirty_pages, pos, copied);
+ ret = btrfs_dirty_pages(root, inode, pages,
+ dirty_pages, pos, copied,
+ NULL);
+ if (ret) {
+ btrfs_delalloc_release_space(inode,
+ dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_drop_pages(pages, num_pages);
+ break;
+ }
}
btrfs_drop_pages(pages, num_pages);
- if (copied > 0) {
- if (will_write) {
- filemap_fdatawrite_range(inode->i_mapping, pos,
- pos + copied - 1);
- } else {
- balance_dirty_pages_ratelimited_nr(
- inode->i_mapping,
- dirty_pages);
- if (dirty_pages <
- (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
- btrfs_btree_balance_dirty(root, 1);
- btrfs_throttle(root);
- }
- }
+ cond_resched();
+
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_pages);
+ if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+ btrfs_btree_balance_dirty(root, 1);
+ btrfs_throttle(root);
pos += copied;
num_written += copied;
+ }
- cond_resched();
+ kfree(pages);
+
+ return num_written ? num_written : ret;
+}
+
+/*
+ * Write with direct I/O, finishing through the page cache when the
+ * direct write comes up short.
+ *
+ * Returns whatever generic_file_direct_write() returned when that was
+ * an error or covered the whole request.  Otherwise returns the total
+ * number of bytes written by both parts; an error from the buffered
+ * part is only returned when the direct part wrote nothing.
+ */
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos,
+ loff_t *ppos, size_t count, size_t ocount)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct iov_iter i;
+ ssize_t written;
+ ssize_t written_buffered;
+ loff_t endbyte;
+ int err;
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+ count, ocount);
+
+ /*
+ * the generic O_DIRECT will update in-memory i_size after the
+ * DIOs are done. But our endio handlers that update the on
+ * disk i_size never update past the in memory i_size. So we
+ * need one more update here to catch any additions to the
+ * file
+ */
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ }
+
+ if (written < 0 || written == count)
+ return written;
+
+ pos += written; /* continue right after the part direct I/O handled */
+ count -= written;
+ iov_iter_init(&i, iov, nr_segs, count, written);
+ written_buffered = __btrfs_buffered_write(file, &i, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+ /*
+ * write the buffered range out and wait for it, then drop those
+ * pages from the page cache again
+ */
+ endbyte = pos + written_buffered - 1;
+ err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (err)
+ goto out;
+ written += written_buffered;
+ *ppos = pos + written_buffered;
+ invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
out:
- mutex_unlock(&inode->i_mutex);
- if (ret)
- err = ret;
+ return written ? written : err;
+}
- kfree(pages);
- if (pinned[0])
- page_cache_release(pinned[0]);
- if (pinned[1])
- page_cache_release(pinned[1]);
- *ppos = pos;
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ loff_t *ppos = &iocb->ki_pos;
+ ssize_t num_written = 0;
+ ssize_t err = 0;
+ size_t count, ocount;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ mutex_lock(&inode->i_mutex);
+
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ count = ocount;
+
+ current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ if (count == 0) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ err = file_remove_suid(file);
+ if (err) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+
+ /*
+ * If BTRFS flips readonly due to some impossible error
+ * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+ * although we have opened a file as writable, we have
+ * to stop this write operation to ensure FS consistency.
+ */
+ if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+ mutex_unlock(&inode->i_mutex);
+ err = -EROFS;
+ goto out;
+ }
+
+ file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = __btrfs_direct_write(iocb, iov, nr_segs,
+ pos, ppos, count, ocount);
+ } else {
+ struct iov_iter i;
+
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+
+ num_written = __btrfs_buffered_write(file, &i, pos);
+ if (num_written > 0)
+ *ppos = pos + num_written;
+ }
+
+ mutex_unlock(&inode->i_mutex);
/*
* we want to make sure fsync finds this change
@@ -1087,43 +1412,12 @@ out:
* one running right now.
*/
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
-
- if (num_written > 0 && will_write) {
- struct btrfs_trans_handle *trans;
-
- err = btrfs_wait_ordered_range(inode, start_pos, num_written);
- if (err)
+ if (num_written > 0 || num_written == -EIOCBQUEUED) {
+ err = generic_write_sync(file, pos, num_written);
+ if (err < 0 && num_written > 0)
num_written = err;
-
- if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- num_written = PTR_ERR(trans);
- goto done;
- }
- mutex_lock(&inode->i_mutex);
- ret = btrfs_log_dentry_safe(trans, root,
- file->f_dentry);
- mutex_unlock(&inode->i_mutex);
- if (ret == 0) {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0)
- btrfs_end_transaction(trans, root);
- else
- btrfs_commit_transaction(trans, root);
- } else if (ret != BTRFS_NO_LOG_SYNC) {
- btrfs_commit_transaction(trans, root);
- } else {
- btrfs_end_transaction(trans, root);
- }
- }
- if (file->f_flags & O_DIRECT && buffered) {
- invalidate_mapping_pages(inode->i_mapping,
- start_pos >> PAGE_CACHE_SHIFT,
- (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
- }
}
-done:
+out:
current->backing_dev_info = NULL;
return num_written ? num_written : err;
}
@@ -1166,6 +1460,7 @@ int btrfs_sync_file(struct file *file, int datasync)
int ret = 0;
struct btrfs_trans_handle *trans;
+ trace_btrfs_sync_file(file, datasync);
/* we wait first, since the writeback may change the inode */
root->log_batch++;
@@ -1293,7 +1588,8 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
if (alloc_start > inode->i_size) {
- ret = btrfs_cont_expand(inode, alloc_start);
+ ret = btrfs_cont_expand(inode, i_size_read(inode),
+ alloc_start);
if (ret)
goto out;
}
@@ -1337,7 +1633,7 @@ static long btrfs_fallocate(struct file *file, int mode,
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
+ BUG_ON(IS_ERR_OR_NULL(em));
last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask;
if (em->block_start == EXTENT_MAP_HOLE ||
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a0390657451b..70d45795d758 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -24,18 +24,18 @@
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
+#include "extent_io.h"
+#include "inode-map.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-static void recalculate_thresholds(struct btrfs_block_group_cache
- *block_group);
-static int link_free_space(struct btrfs_block_group_cache *block_group,
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info);
-struct inode *lookup_free_space_inode(struct btrfs_root *root,
- struct btrfs_block_group_cache
- *block_group, struct btrfs_path *path)
+static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 offset)
{
struct btrfs_key key;
struct btrfs_key location;
@@ -45,22 +45,15 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct inode *inode = NULL;
int ret;
- spin_lock(&block_group->lock);
- if (block_group->inode)
- inode = igrab(block_group->inode);
- spin_unlock(&block_group->lock);
- if (inode)
- return inode;
-
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ERR_PTR(-ENOENT);
}
@@ -69,7 +62,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header);
btrfs_free_space_key(leaf, header, &disk_key);
btrfs_disk_key_to_cpu(&location, &disk_key);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
if (!inode)
@@ -81,6 +74,29 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return ERR_PTR(-ENOENT);
}
+ inode->i_mapping->flags &= ~__GFP_FS;
+
+ return inode;
+}
+
+struct inode *lookup_free_space_inode(struct btrfs_root *root,
+ struct btrfs_block_group_cache
+ *block_group, struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&block_group->lock);
+ if (block_group->inode)
+ inode = igrab(block_group->inode);
+ spin_unlock(&block_group->lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path,
+ block_group->key.objectid);
+ if (IS_ERR(inode))
+ return inode;
+
spin_lock(&block_group->lock);
if (!root->fs_info->closing) {
block_group->inode = igrab(inode);
@@ -91,24 +107,18 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
return inode;
}
-int create_free_space_inode(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path)
+int __create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 ino, u64 offset)
{
struct btrfs_key key;
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- u64 objectid;
int ret;
- ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
- if (ret < 0)
- return ret;
-
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+ ret = btrfs_insert_empty_inode(trans, root, path, ino);
if (ret)
return ret;
@@ -128,19 +138,18 @@ int create_free_space_inode(struct btrfs_root *root,
BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
btrfs_set_inode_nlink(leaf, inode_item, 1);
btrfs_set_inode_transid(leaf, inode_item, trans->transid);
- btrfs_set_inode_block_group(leaf, inode_item,
- block_group->key.objectid);
+ btrfs_set_inode_block_group(leaf, inode_item, offset);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
leaf = path->nodes[0];
@@ -149,11 +158,27 @@ int create_free_space_inode(struct btrfs_root *root,
memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
btrfs_set_free_space_key(leaf, header, &disk_key);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
+int create_free_space_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ int ret;
+ u64 ino;
+
+ ret = btrfs_find_free_objectid(root, &ino);
+ if (ret < 0)
+ return ret;
+
+ return __create_free_space_inode(root, trans, path, ino,
+ block_group->key.objectid);
+}
+
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_path *path,
@@ -184,7 +209,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
return ret;
}
- return btrfs_update_inode(trans, root, inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ return ret;
}
static int readahead_cache(struct inode *inode)
@@ -206,15 +232,13 @@ static int readahead_cache(struct inode *inode)
return 0;
}
-int load_free_space_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *block_group)
+int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_path *path, u64 offset)
{
- struct btrfs_root *root = fs_info->tree_root;
- struct inode *inode;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct page *page;
- struct btrfs_path *path;
u32 *checksums = NULL, *crc;
char *disk_crcs = NULL;
struct btrfs_key key;
@@ -226,71 +250,43 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
pgoff_t index = 0;
unsigned long first_page_offset;
int num_checksums;
- int ret = 0;
-
- /*
- * If we're unmounting then just return, since this does a search on the
- * normal root and not the commit root and we could deadlock.
- */
- smp_mb();
- if (fs_info->closing)
- return 0;
-
- /*
- * If this block group has been marked to be cleared for one reason or
- * another then we can't trust the on disk cache, so just return.
- */
- spin_lock(&block_group->lock);
- if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
- spin_unlock(&block_group->lock);
- return 0;
- }
- spin_unlock(&block_group->lock);
+ int ret = 0, ret2;
INIT_LIST_HEAD(&bitmaps);
- path = btrfs_alloc_path();
- if (!path)
- return 0;
-
- inode = lookup_free_space_inode(root, block_group, path);
- if (IS_ERR(inode)) {
- btrfs_free_path(path);
- return 0;
- }
-
/* Nothing in the space cache, goodbye */
- if (!i_size_read(inode)) {
- btrfs_free_path(path);
+ if (!i_size_read(inode))
goto out;
- }
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret) {
- btrfs_free_path(path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0) {
+ btrfs_release_path(path);
+ ret = 0;
goto out;
}
+ ret = -1;
+
leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
num_entries = btrfs_free_space_entries(leaf, header);
num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
generation = btrfs_free_space_generation(leaf, header);
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (BTRFS_I(inode)->generation != generation) {
printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
- " not match free space cache generation (%llu) for "
- "block group %llu\n",
+ " not match free space cache generation (%llu)\n",
(unsigned long long)BTRFS_I(inode)->generation,
- (unsigned long long)generation,
- (unsigned long long)block_group->key.objectid);
- goto free_cache;
+ (unsigned long long)generation);
+ goto out;
}
if (!num_entries)
@@ -307,10 +303,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
goto out;
ret = readahead_cache(inode);
- if (ret) {
- ret = 0;
+ if (ret)
goto out;
- }
while (1) {
struct btrfs_free_space_entry *entry;
@@ -329,10 +323,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
}
page = grab_cache_page(inode->i_mapping, index);
- if (!page) {
- ret = 0;
+ if (!page)
goto free_cache;
- }
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -341,9 +333,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
unlock_page(page);
page_cache_release(page);
printk(KERN_ERR "btrfs: error reading free "
- "space cache: %llu\n",
- (unsigned long long)
- block_group->key.objectid);
+ "space cache\n");
goto free_cache;
}
}
@@ -356,13 +346,10 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
gen = addr + (sizeof(u32) * num_checksums);
if (*gen != BTRFS_I(inode)->generation) {
printk(KERN_ERR "btrfs: space cache generation"
- " (%llu) does not match inode (%llu) "
- "for block group %llu\n",
+ " (%llu) does not match inode (%llu)\n",
(unsigned long long)*gen,
(unsigned long long)
- BTRFS_I(inode)->generation,
- (unsigned long long)
- block_group->key.objectid);
+ BTRFS_I(inode)->generation);
kunmap(page);
unlock_page(page);
page_cache_release(page);
@@ -378,9 +365,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
PAGE_CACHE_SIZE - start_offset);
btrfs_csum_final(cur_crc, (char *)&cur_crc);
if (cur_crc != *crc) {
- printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
- "block group %llu\n", index,
- (unsigned long long)block_group->key.objectid);
+ printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
+ index);
kunmap(page);
unlock_page(page);
page_cache_release(page);
@@ -393,7 +379,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
break;
need_loop = 1;
- e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ e = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
if (!e) {
kunmap(page);
unlock_page(page);
@@ -405,31 +392,32 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
e->bytes = le64_to_cpu(entry->bytes);
if (!e->bytes) {
kunmap(page);
- kfree(e);
+ kmem_cache_free(btrfs_free_space_cachep, e);
unlock_page(page);
page_cache_release(page);
goto free_cache;
}
if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
- spin_lock(&block_group->tree_lock);
- ret = link_free_space(block_group, e);
- spin_unlock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
+ ret = link_free_space(ctl, e);
+ spin_unlock(&ctl->tree_lock);
BUG_ON(ret);
} else {
e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
if (!e->bitmap) {
kunmap(page);
- kfree(e);
+ kmem_cache_free(
+ btrfs_free_space_cachep, e);
unlock_page(page);
page_cache_release(page);
goto free_cache;
}
- spin_lock(&block_group->tree_lock);
- ret = link_free_space(block_group, e);
- block_group->total_bitmaps++;
- recalculate_thresholds(block_group);
- spin_unlock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
+ ret2 = link_free_space(ctl, e);
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
list_add_tail(&e->list, &bitmaps);
}
@@ -469,85 +457,156 @@ next:
out:
kfree(checksums);
kfree(disk_crcs);
- iput(inode);
return ret;
-
free_cache:
- /* This cache is bogus, make sure it gets cleared */
+ __btrfs_remove_free_space_cache(ctl);
+ goto out;
+}
+
+int load_free_space_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_root *root = fs_info->tree_root;
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+ bool matched;
+ u64 used = btrfs_block_group_used(&block_group->item);
+
+ /*
+ * If we're unmounting then just return, since this does a search on the
+ * normal root and not the commit root and we could deadlock.
+ */
+ smp_mb();
+ if (fs_info->closing)
+ return 0;
+
+ /*
+ * If this block group has been marked to be cleared for one reason or
+ * another then we can't trust the on disk cache, so just return.
+ */
spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
- btrfs_remove_free_space_cache(block_group);
- goto out;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode)) {
+ btrfs_free_path(path);
+ return 0;
+ }
+
+ ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
+ path, block_group->key.objectid);
+ btrfs_free_path(path);
+ if (ret <= 0)
+ goto out;
+
+ spin_lock(&ctl->tree_lock);
+ matched = (ctl->free_space == (block_group->key.offset - used -
+ block_group->bytes_super));
+ spin_unlock(&ctl->tree_lock);
+
+ if (!matched) {
+ __btrfs_remove_free_space_cache(ctl);
+ printk(KERN_ERR "block group %llu has an wrong amount of free "
+ "space\n", block_group->key.objectid);
+ ret = -1;
+ }
+out:
+ if (ret < 0) {
+ /* This cache is bogus, make sure it gets cleared */
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_CLEAR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
+
+ printk(KERN_ERR "btrfs: failed to load free space cache "
+ "for block group %llu\n", block_group->key.objectid);
+ }
+
+ iput(inode);
+ return ret;
}
-int btrfs_write_out_cache(struct btrfs_root *root,
- struct btrfs_trans_handle *trans,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_path *path)
+int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
{
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct inode *inode;
struct rb_node *node;
struct list_head *pos, *n;
+ struct page **pages;
struct page *page;
struct extent_state *cached_state = NULL;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct extent_io_tree *unpin = NULL;
struct list_head bitmap_list;
struct btrfs_key key;
+ u64 start, end, len;
u64 bytes = 0;
u32 *crc, *checksums;
- pgoff_t index = 0, last_index = 0;
unsigned long first_page_offset;
- int num_checksums;
+ int index = 0, num_pages = 0;
int entries = 0;
int bitmaps = 0;
- int ret = 0;
-
- root = root->fs_info->tree_root;
+ int ret = -1;
+ bool next_page = false;
+ bool out_of_space = false;
INIT_LIST_HEAD(&bitmap_list);
- spin_lock(&block_group->lock);
- if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
- spin_unlock(&block_group->lock);
- return 0;
- }
- spin_unlock(&block_group->lock);
-
- inode = lookup_free_space_inode(root, block_group, path);
- if (IS_ERR(inode))
- return 0;
-
- if (!i_size_read(inode)) {
- iput(inode);
+ node = rb_first(&ctl->free_space_offset);
+ if (!node)
return 0;
- }
- node = rb_first(&block_group->free_space_offset);
- if (!node) {
- iput(inode);
- return 0;
- }
+ if (!i_size_read(inode))
+ return -1;
- last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
filemap_write_and_wait(inode->i_mapping);
btrfs_wait_ordered_range(inode, inode->i_size &
~(root->sectorsize - 1), (u64)-1);
/* We need a checksum per page. */
- num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
- crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
- if (!crc) {
- iput(inode);
- return 0;
+ crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
+ if (!crc)
+ return -1;
+
+ pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
+ if (!pages) {
+ kfree(crc);
+ return -1;
}
/* Since the first page has all of our checksums and our generation we
* need to calculate the offset into the page that we can start writing
* our entries.
*/
- first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
+ first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
+
+ /* Get the cluster for this block_group if it exists */
+ if (block_group && !list_empty(&block_group->cluster_list))
+ cluster = list_entry(block_group->cluster_list.next,
+ struct btrfs_free_cluster,
+ block_group_list);
+
+ /*
+ * We shouldn't have switched the pinned extents yet so this is the
+ * right one
+ */
+ unpin = root->fs_info->pinned_extents;
/*
* Lock all pages first so we can lock the extent safely.
@@ -557,20 +616,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
* after find_get_page at this point. Just putting this here so people
* know and don't freak out.
*/
- while (index <= last_index) {
+ while (index < num_pages) {
page = grab_cache_page(inode->i_mapping, index);
if (!page) {
- pgoff_t i = 0;
+ int i;
- while (i < index) {
- page = find_get_page(inode->i_mapping, i);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
- i++;
+ for (i = 0; i < num_pages; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
}
goto out_free;
}
+ pages[index] = page;
index++;
}
@@ -578,6 +635,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state, GFP_NOFS);
+ /*
+ * When searching for pinned extents, we need to start at our start
+ * offset.
+ */
+ if (block_group)
+ start = block_group->key.objectid;
+
/* Write out the extent entries */
do {
struct btrfs_free_space_entry *entry;
@@ -585,18 +649,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
unsigned long offset = 0;
unsigned long start_offset = 0;
+ next_page = false;
+
if (index == 0) {
start_offset = first_page_offset;
offset = start_offset;
}
- page = find_get_page(inode->i_mapping, index);
+ if (index >= num_pages) {
+ out_of_space = true;
+ break;
+ }
+
+ page = pages[index];
addr = kmap(page);
entry = addr + start_offset;
memset(addr, 0, PAGE_CACHE_SIZE);
- while (1) {
+ while (node && !next_page) {
struct btrfs_free_space *e;
e = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -612,12 +683,50 @@ int btrfs_write_out_cache(struct btrfs_root *root,
entry->type = BTRFS_FREE_SPACE_EXTENT;
}
node = rb_next(node);
- if (!node)
- break;
+ if (!node && cluster) {
+ node = rb_first(&cluster->root);
+ cluster = NULL;
+ }
offset += sizeof(struct btrfs_free_space_entry);
if (offset + sizeof(struct btrfs_free_space_entry) >=
PAGE_CACHE_SIZE)
+ next_page = true;
+ entry++;
+ }
+
+ /*
+ * We want to add any pinned extents to our free space cache
+ * so we don't leak the space
+ */
+ while (block_group && !next_page &&
+ (start < block_group->key.objectid +
+ block_group->key.offset)) {
+ ret = find_first_extent_bit(unpin, start, &start, &end,
+ EXTENT_DIRTY);
+ if (ret) {
+ ret = 0;
break;
+ }
+
+ /* This pinned extent is out of our range */
+ if (start >= block_group->key.objectid +
+ block_group->key.offset)
+ break;
+
+ len = block_group->key.objectid +
+ block_group->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ entries++;
+ entry->offset = cpu_to_le64(start);
+ entry->bytes = cpu_to_le64(len);
+ entry->type = BTRFS_FREE_SPACE_EXTENT;
+
+ start = end + 1;
+ offset += sizeof(struct btrfs_free_space_entry);
+ if (offset + sizeof(struct btrfs_free_space_entry) >=
+ PAGE_CACHE_SIZE)
+ next_page = true;
entry++;
}
*crc = ~(u32)0;
@@ -630,25 +739,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
bytes += PAGE_CACHE_SIZE;
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
-
- /*
- * We need to release our reference we got for grab_cache_page,
- * except for the first page which will hold our checksums, we
- * do that below.
- */
- if (index != 0) {
- unlock_page(page);
- page_cache_release(page);
- }
-
- page_cache_release(page);
-
index++;
- } while (node);
+ } while (node || next_page);
/* Write out the bitmaps */
list_for_each_safe(pos, n, &bitmap_list) {
@@ -656,7 +748,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- page = find_get_page(inode->i_mapping, index);
+ if (index >= num_pages) {
+ out_of_space = true;
+ break;
+ }
+ page = pages[index];
addr = kmap(page);
memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -667,73 +763,67 @@ int btrfs_write_out_cache(struct btrfs_root *root,
crc++;
bytes += PAGE_CACHE_SIZE;
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
list_del_init(&entry->list);
index++;
}
+ if (out_of_space) {
+ btrfs_drop_pages(pages, num_pages);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state,
+ GFP_NOFS);
+ ret = 0;
+ goto out_free;
+ }
+
/* Zero out the rest of the pages just to make sure */
- while (index <= last_index) {
+ while (index < num_pages) {
void *addr;
- page = find_get_page(inode->i_mapping, index);
-
+ page = pages[index];
addr = kmap(page);
memset(addr, 0, PAGE_CACHE_SIZE);
kunmap(page);
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
bytes += PAGE_CACHE_SIZE;
index++;
}
- btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
-
/* Write the checksums and trans id to the first page */
{
void *addr;
u64 *gen;
- page = find_get_page(inode->i_mapping, 0);
+ page = pages[0];
addr = kmap(page);
- memcpy(addr, checksums, sizeof(u32) * num_checksums);
- gen = addr + (sizeof(u32) * num_checksums);
+ memcpy(addr, checksums, sizeof(u32) * num_pages);
+ gen = addr + (sizeof(u32) * num_pages);
*gen = trans->transid;
kunmap(page);
- ClearPageChecked(page);
- set_page_extent_mapped(page);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- page_cache_release(page);
- page_cache_release(page);
}
- BTRFS_I(inode)->generation = trans->transid;
+ ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
+ bytes, &cached_state);
+ btrfs_drop_pages(pages, num_pages);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ if (ret) {
+ ret = 0;
+ goto out_free;
+ }
+
+ BTRFS_I(inode)->generation = trans->transid;
+
filemap_write_and_wait(inode->i_mapping);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = block_group->key.objectid;
+ key.offset = offset;
key.type = 0;
ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
if (ret < 0) {
- ret = 0;
+ ret = -1;
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
@@ -746,13 +836,13 @@ int btrfs_write_out_cache(struct btrfs_root *root,
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
- found_key.offset != block_group->key.objectid) {
- ret = 0;
+ found_key.offset != offset) {
+ ret = -1;
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, NULL,
GFP_NOFS);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto out_free;
}
}
@@ -762,48 +852,83 @@ int btrfs_write_out_cache(struct btrfs_root *root,
btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
btrfs_set_free_space_generation(leaf, header, trans->transid);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = 1;
out_free:
- if (ret == 0) {
+ if (ret != 1) {
invalidate_inode_pages2_range(inode->i_mapping, 0, index);
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_ERROR;
- spin_unlock(&block_group->lock);
BTRFS_I(inode)->generation = 0;
}
kfree(checksums);
+ kfree(pages);
btrfs_update_inode(trans, root, inode);
+ return ret;
+}
+
+int btrfs_write_out_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_path *path)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct inode *inode;
+ int ret = 0;
+
+ root = root->fs_info->tree_root;
+
+ spin_lock(&block_group->lock);
+ if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
+ spin_unlock(&block_group->lock);
+
+ inode = lookup_free_space_inode(root, block_group, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+ path, block_group->key.objectid);
+ if (ret < 0) {
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = BTRFS_DC_ERROR;
+ spin_unlock(&block_group->lock);
+ ret = 0;
+
+ printk(KERN_ERR "btrfs: failed to write free space cace "
+ "for block group %llu\n", block_group->key.objectid);
+ }
+
iput(inode);
return ret;
}
-static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
u64 offset)
{
BUG_ON(offset < bitmap_start);
offset -= bitmap_start;
- return (unsigned long)(div64_u64(offset, sectorsize));
+ return (unsigned long)(div_u64(offset, unit));
}
-static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
{
- return (unsigned long)(div64_u64(bytes, sectorsize));
+ return (unsigned long)(div_u64(bytes, unit));
}
-static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
u64 offset)
{
u64 bitmap_start;
u64 bytes_per_bitmap;
- bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
- bitmap_start = offset - block_group->key.objectid;
+ bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
+ bitmap_start = offset - ctl->start;
bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
bitmap_start *= bytes_per_bitmap;
- bitmap_start += block_group->key.objectid;
+ bitmap_start += ctl->start;
return bitmap_start;
}
@@ -861,10 +986,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
* offset.
*/
static struct btrfs_free_space *
-tree_search_offset(struct btrfs_block_group_cache *block_group,
+tree_search_offset(struct btrfs_free_space_ctl *ctl,
u64 offset, int bitmap_only, int fuzzy)
{
- struct rb_node *n = block_group->free_space_offset.rb_node;
+ struct rb_node *n = ctl->free_space_offset.rb_node;
struct btrfs_free_space *entry, *prev = NULL;
/* find entry that is closest to the 'offset' */
@@ -960,8 +1085,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
break;
}
}
- if (entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize > offset)
+ if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
return entry;
} else if (entry->offset + entry->bytes > offset)
return entry;
@@ -972,7 +1096,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
while (1) {
if (entry->bitmap) {
if (entry->offset + BITS_PER_BITMAP *
- block_group->sectorsize > offset)
+ ctl->unit > offset)
break;
} else {
if (entry->offset + entry->bytes > offset)
@@ -988,42 +1112,47 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
}
static inline void
-__unlink_free_space(struct btrfs_block_group_cache *block_group,
+__unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- rb_erase(&info->offset_index, &block_group->free_space_offset);
- block_group->free_extents--;
+ rb_erase(&info->offset_index, &ctl->free_space_offset);
+ ctl->free_extents--;
}
-static void unlink_free_space(struct btrfs_block_group_cache *block_group,
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
- __unlink_free_space(block_group, info);
- block_group->free_space -= info->bytes;
+ __unlink_free_space(ctl, info);
+ ctl->free_space -= info->bytes;
}
-static int link_free_space(struct btrfs_block_group_cache *block_group,
+static int link_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
int ret = 0;
BUG_ON(!info->bitmap && !info->bytes);
- ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
+ ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
&info->offset_index, (info->bitmap != NULL));
if (ret)
return ret;
- block_group->free_space += info->bytes;
- block_group->free_extents++;
+ ctl->free_space += info->bytes;
+ ctl->free_extents++;
return ret;
}
-static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
{
+ struct btrfs_block_group_cache *block_group = ctl->private;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
u64 size = block_group->key.offset;
+ u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
+
+ BUG_ON(ctl->total_bitmaps > max_bitmaps);
/*
* The goal is to keep the total amount of memory used per 1gb of space
@@ -1041,10 +1170,10 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
- bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+ bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
if (bitmap_bytes >= max_bytes) {
- block_group->extents_thresh = 0;
+ ctl->extents_thresh = 0;
return;
}
@@ -1055,47 +1184,43 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
extent_bytes = max_bytes - bitmap_bytes;
extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
- block_group->extents_thresh =
+ ctl->extents_thresh =
div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
-static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
- unsigned long start, end;
- unsigned long i;
+ unsigned long start, count;
- start = offset_to_bit(info->offset, block_group->sectorsize, offset);
- end = start + bytes_to_bits(bytes, block_group->sectorsize);
- BUG_ON(end > BITS_PER_BITMAP);
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ BUG_ON(start + count > BITS_PER_BITMAP);
- for (i = start; i < end; i++)
- clear_bit(i, info->bitmap);
+ bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
- block_group->free_space -= bytes;
+ ctl->free_space -= bytes;
}
-static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset,
u64 bytes)
{
- unsigned long start, end;
- unsigned long i;
+ unsigned long start, count;
- start = offset_to_bit(info->offset, block_group->sectorsize, offset);
- end = start + bytes_to_bits(bytes, block_group->sectorsize);
- BUG_ON(end > BITS_PER_BITMAP);
+ start = offset_to_bit(info->offset, ctl->unit, offset);
+ count = bytes_to_bits(bytes, ctl->unit);
+ BUG_ON(start + count > BITS_PER_BITMAP);
- for (i = start; i < end; i++)
- set_bit(i, info->bitmap);
+ bitmap_set(info->bitmap, start, count);
info->bytes += bytes;
- block_group->free_space += bytes;
+ ctl->free_space += bytes;
}
-static int search_bitmap(struct btrfs_block_group_cache *block_group,
+static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
u64 *bytes)
{
@@ -1103,9 +1228,9 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
unsigned long bits, i;
unsigned long next_zero;
- i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+ i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
- bits = bytes_to_bits(*bytes, block_group->sectorsize);
+ bits = bytes_to_bits(*bytes, ctl->unit);
for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
i < BITS_PER_BITMAP;
@@ -1120,29 +1245,25 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
}
if (found_bits) {
- *offset = (u64)(i * block_group->sectorsize) +
- bitmap_info->offset;
- *bytes = (u64)(found_bits) * block_group->sectorsize;
+ *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
+ *bytes = (u64)(found_bits) * ctl->unit;
return 0;
}
return -1;
}
-static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
- *block_group, u64 *offset,
- u64 *bytes, int debug)
+static struct btrfs_free_space *
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
{
struct btrfs_free_space *entry;
struct rb_node *node;
int ret;
- if (!block_group->free_space_offset.rb_node)
+ if (!ctl->free_space_offset.rb_node)
return NULL;
- entry = tree_search_offset(block_group,
- offset_to_bitmap(block_group, *offset),
- 0, 1);
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
if (!entry)
return NULL;
@@ -1152,7 +1273,7 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
continue;
if (entry->bitmap) {
- ret = search_bitmap(block_group, entry, offset, bytes);
+ ret = search_bitmap(ctl, entry, offset, bytes);
if (!ret)
return entry;
continue;
@@ -1166,33 +1287,28 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
return NULL;
}
-static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, u64 offset)
{
- u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
- int max_bitmaps = (int)div64_u64(block_group->key.offset +
- bytes_per_bg - 1, bytes_per_bg);
- BUG_ON(block_group->total_bitmaps >= max_bitmaps);
-
- info->offset = offset_to_bitmap(block_group, offset);
+ info->offset = offset_to_bitmap(ctl, offset);
info->bytes = 0;
- link_free_space(block_group, info);
- block_group->total_bitmaps++;
+ link_free_space(ctl, info);
+ ctl->total_bitmaps++;
- recalculate_thresholds(block_group);
+ ctl->op->recalc_thresholds(ctl);
}
-static void free_bitmap(struct btrfs_block_group_cache *block_group,
+static void free_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info)
{
- unlink_free_space(block_group, bitmap_info);
+ unlink_free_space(ctl, bitmap_info);
kfree(bitmap_info->bitmap);
- kfree(bitmap_info);
- block_group->total_bitmaps--;
- recalculate_thresholds(block_group);
+ kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
}
-static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info,
u64 *offset, u64 *bytes)
{
@@ -1201,8 +1317,7 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
int ret;
again:
- end = bitmap_info->offset +
- (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
/*
* XXX - this can go away after a few releases.
@@ -1217,24 +1332,22 @@ again:
search_start = *offset;
search_bytes = *bytes;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(block_group, bitmap_info, &search_start,
- &search_bytes);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
BUG_ON(ret < 0 || search_start != *offset);
if (*offset > bitmap_info->offset && *offset + *bytes > end) {
- bitmap_clear_bits(block_group, bitmap_info, *offset,
- end - *offset + 1);
+ bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
*bytes -= end - *offset + 1;
*offset = end + 1;
} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
- bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+ bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
*bytes = 0;
}
if (*bytes) {
struct rb_node *next = rb_next(&bitmap_info->offset_index);
if (!bitmap_info->bytes)
- free_bitmap(block_group, bitmap_info);
+ free_bitmap(ctl, bitmap_info);
/*
* no entry after this bitmap, but we still have bytes to
@@ -1261,33 +1374,42 @@ again:
*/
search_start = *offset;
search_bytes = *bytes;
- ret = search_bitmap(block_group, bitmap_info, &search_start,
+ ret = search_bitmap(ctl, bitmap_info, &search_start,
&search_bytes);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
goto again;
} else if (!bitmap_info->bytes)
- free_bitmap(block_group, bitmap_info);
+ free_bitmap(ctl, bitmap_info);
return 0;
}
-static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
- struct btrfs_free_space *info)
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
{
- struct btrfs_free_space *bitmap_info;
- int added = 0;
- u64 bytes, offset, end;
- int ret;
+ struct btrfs_block_group_cache *block_group = ctl->private;
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (block_group->free_extents < block_group->extents_thresh &&
- info->bytes > block_group->sectorsize * 4)
- return 0;
+ if (ctl->free_extents < ctl->extents_thresh) {
+ /*
+ * If this block group has some small extents we don't want to
+ * use up all of our free slots in the cache with them, we want
+ * to reserve them to larger extents, however if we have plenty
+ * of cache left then go ahead and add them, no sense in adding
+ * the overhead of a bitmap if we don't have to.
+ */
+ if (info->bytes <= block_group->sectorsize * 4) {
+ if (ctl->free_extents * 2 <= ctl->extents_thresh)
+ return false;
+ } else {
+ return false;
+ }
+ }
/*
* some block groups are so tiny they can't be enveloped by a bitmap, so
@@ -1295,31 +1417,42 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
*/
if (BITS_PER_BITMAP * block_group->sectorsize >
block_group->key.offset)
- return 0;
+ return false;
+
+ return true;
+}
+
+static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ int added = 0;
+ u64 bytes, offset, end;
+ int ret;
bytes = info->bytes;
offset = info->offset;
+ if (!ctl->op->use_bitmap(ctl, info))
+ return 0;
+
again:
- bitmap_info = tree_search_offset(block_group,
- offset_to_bitmap(block_group, offset),
+ bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!bitmap_info) {
BUG_ON(added);
goto new_bitmap;
}
- end = bitmap_info->offset +
- (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+ end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
if (offset >= bitmap_info->offset && offset + bytes > end) {
- bitmap_set_bits(block_group, bitmap_info, offset,
- end - offset);
+ bitmap_set_bits(ctl, bitmap_info, offset, end - offset);
bytes -= end - offset;
offset = end;
added = 0;
} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
- bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+ bitmap_set_bits(ctl, bitmap_info, offset, bytes);
bytes = 0;
} else {
BUG();
@@ -1333,19 +1466,19 @@ again:
new_bitmap:
if (info && info->bitmap) {
- add_new_bitmap(block_group, info, offset);
+ add_new_bitmap(ctl, info, offset);
added = 1;
info = NULL;
goto again;
} else {
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
/* no pre-allocated info, allocate a new one */
if (!info) {
- info = kzalloc(sizeof(struct btrfs_free_space),
- GFP_NOFS);
+ info = kmem_cache_zalloc(btrfs_free_space_cachep,
+ GFP_NOFS);
if (!info) {
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
ret = -ENOMEM;
goto out;
}
@@ -1353,7 +1486,7 @@ new_bitmap:
/* allocate the bitmap */
info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
if (!info->bitmap) {
ret = -ENOMEM;
goto out;
@@ -1365,13 +1498,13 @@ out:
if (info) {
if (info->bitmap)
kfree(info->bitmap);
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
return ret;
}
-bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
+static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
struct btrfs_free_space *left_info;
@@ -1385,54 +1518,54 @@ bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
- right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+ right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
else
- left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+ left_info = tree_search_offset(ctl, offset - 1, 0, 0);
if (right_info && !right_info->bitmap) {
if (update_stat)
- unlink_free_space(block_group, right_info);
+ unlink_free_space(ctl, right_info);
else
- __unlink_free_space(block_group, right_info);
+ __unlink_free_space(ctl, right_info);
info->bytes += right_info->bytes;
- kfree(right_info);
+ kmem_cache_free(btrfs_free_space_cachep, right_info);
merged = true;
}
if (left_info && !left_info->bitmap &&
left_info->offset + left_info->bytes == offset) {
if (update_stat)
- unlink_free_space(block_group, left_info);
+ unlink_free_space(ctl, left_info);
else
- __unlink_free_space(block_group, left_info);
+ __unlink_free_space(ctl, left_info);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
- kfree(left_info);
+ kmem_cache_free(btrfs_free_space_cachep, left_info);
merged = true;
}
return merged;
}
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 offset, u64 bytes)
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 offset, u64 bytes)
{
struct btrfs_free_space *info;
int ret = 0;
- info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
+ info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
if (!info)
return -ENOMEM;
info->offset = offset;
info->bytes = bytes;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
- if (try_merge_free_space(block_group, info, true))
+ if (try_merge_free_space(ctl, info, true))
goto link;
/*
@@ -1440,7 +1573,7 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
* extent then we know we're going to have to allocate a new extent, so
* before we do that see if we need to drop this into a bitmap
*/
- ret = insert_into_bitmap(block_group, info);
+ ret = insert_into_bitmap(ctl, info);
if (ret < 0) {
goto out;
} else if (ret) {
@@ -1448,11 +1581,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
goto out;
}
link:
- ret = link_free_space(block_group, info);
+ ret = link_free_space(ctl, info);
if (ret)
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
out:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
if (ret) {
printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
@@ -1465,21 +1598,21 @@ out:
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct btrfs_free_space *next_info = NULL;
int ret = 0;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
again:
- info = tree_search_offset(block_group, offset, 0, 0);
+ info = tree_search_offset(ctl, offset, 0, 0);
if (!info) {
/*
* oops didn't find an extent that matched the space we wanted
* to remove, look for a bitmap instead
*/
- info = tree_search_offset(block_group,
- offset_to_bitmap(block_group, offset),
+ info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1, 0);
if (!info) {
WARN_ON(1);
@@ -1494,8 +1627,8 @@ again:
offset_index);
if (next_info->bitmap)
- end = next_info->offset + BITS_PER_BITMAP *
- block_group->sectorsize - 1;
+ end = next_info->offset +
+ BITS_PER_BITMAP * ctl->unit - 1;
else
end = next_info->offset + next_info->bytes;
@@ -1515,20 +1648,20 @@ again:
}
if (info->bytes == bytes) {
- unlink_free_space(block_group, info);
+ unlink_free_space(ctl, info);
if (info->bitmap) {
kfree(info->bitmap);
- block_group->total_bitmaps--;
+ ctl->total_bitmaps--;
}
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
goto out_lock;
}
if (!info->bitmap && info->offset == offset) {
- unlink_free_space(block_group, info);
+ unlink_free_space(ctl, info);
info->offset += bytes;
info->bytes -= bytes;
- link_free_space(block_group, info);
+ link_free_space(ctl, info);
goto out_lock;
}
@@ -1542,13 +1675,13 @@ again:
* first unlink the old info and then
* insert it again after the hole we're creating
*/
- unlink_free_space(block_group, info);
+ unlink_free_space(ctl, info);
if (offset + bytes < info->offset + info->bytes) {
u64 old_end = info->offset + info->bytes;
info->offset = offset + bytes;
info->bytes = old_end - info->offset;
- ret = link_free_space(block_group, info);
+ ret = link_free_space(ctl, info);
WARN_ON(ret);
if (ret)
goto out_lock;
@@ -1556,9 +1689,9 @@ again:
/* the hole we're creating ends at the end
* of the info struct, just free the info
*/
- kfree(info);
+ kmem_cache_free(btrfs_free_space_cachep, info);
}
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
/* step two, insert a new info struct to cover
* anything before the hole
@@ -1569,12 +1702,12 @@ again:
goto out;
}
- ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+ ret = remove_from_bitmap(ctl, info, &offset, &bytes);
if (ret == -EAGAIN)
goto again;
BUG_ON(ret);
out_lock:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
out:
return ret;
}
@@ -1582,11 +1715,12 @@ out:
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *info;
struct rb_node *n;
int count = 0;
- for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
+ for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes)
count++;
@@ -1601,19 +1735,28 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
"\n", count);
}
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
+static struct btrfs_free_space_op free_space_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
{
- struct btrfs_free_space *info;
- struct rb_node *n;
- u64 ret = 0;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- for (n = rb_first(&block_group->free_space_offset); n;
- n = rb_next(n)) {
- info = rb_entry(n, struct btrfs_free_space, offset_index);
- ret += info->bytes;
- }
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = block_group->sectorsize;
+ ctl->start = block_group->key.objectid;
+ ctl->private = block_group;
+ ctl->op = &free_space_op;
- return ret;
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ ctl->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
}
/*
@@ -1627,32 +1770,31 @@ __btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry;
struct rb_node *node;
- bool bitmap;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group)
goto out;
- bitmap = cluster->points_to_bitmap;
cluster->block_group = NULL;
cluster->window_start = 0;
list_del_init(&cluster->block_group_list);
- cluster->points_to_bitmap = false;
-
- if (bitmap)
- goto out;
node = rb_first(&cluster->root);
while (node) {
+ bool bitmap;
+
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
- BUG_ON(entry->bitmap);
- try_merge_free_space(block_group, entry, false);
- tree_insert_offset(&block_group->free_space_offset,
- entry->offset, &entry->offset_index, 0);
+
+ bitmap = (entry->bitmap != NULL);
+ if (!bitmap)
+ try_merge_free_space(ctl, entry, false);
+ tree_insert_offset(&ctl->free_space_offset,
+ entry->offset, &entry->offset_index, bitmap);
}
cluster->root = RB_ROOT;
@@ -1662,14 +1804,38 @@ out:
return 0;
}
-void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
{
struct btrfs_free_space *info;
struct rb_node *node;
+
+ while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
+ unlink_free_space(ctl, info);
+ kfree(info->bitmap);
+ kmem_cache_free(btrfs_free_space_cachep, info);
+ if (need_resched()) {
+ spin_unlock(&ctl->tree_lock);
+ cond_resched();
+ spin_lock(&ctl->tree_lock);
+ }
+ }
+}
+
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
+{
+ spin_lock(&ctl->tree_lock);
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
+}
+
+void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_cluster *cluster;
struct list_head *head;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
while ((head = block_group->cluster_list.next) !=
&block_group->cluster_list) {
cluster = list_entry(head, struct btrfs_free_cluster,
@@ -1678,57 +1844,46 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
if (need_resched()) {
- spin_unlock(&block_group->tree_lock);
- cond_resched();
- spin_lock(&block_group->tree_lock);
- }
- }
-
- while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, offset_index);
- unlink_free_space(block_group, info);
- if (info->bitmap)
- kfree(info->bitmap);
- kfree(info);
- if (need_resched()) {
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
cond_resched();
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
}
}
+ __btrfs_remove_free_space_cache_locked(ctl);
+ spin_unlock(&ctl->tree_lock);
- spin_unlock(&block_group->tree_lock);
}
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes, u64 empty_size)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry = NULL;
u64 bytes_search = bytes + empty_size;
u64 ret = 0;
- spin_lock(&block_group->tree_lock);
- entry = find_free_space(block_group, &offset, &bytes_search, 0);
+ spin_lock(&ctl->tree_lock);
+ entry = find_free_space(ctl, &offset, &bytes_search);
if (!entry)
goto out;
ret = offset;
if (entry->bitmap) {
- bitmap_clear_bits(block_group, entry, offset, bytes);
+ bitmap_clear_bits(ctl, entry, offset, bytes);
if (!entry->bytes)
- free_bitmap(block_group, entry);
+ free_bitmap(ctl, entry);
} else {
- unlink_free_space(block_group, entry);
+ unlink_free_space(ctl, entry);
entry->offset += bytes;
entry->bytes -= bytes;
if (!entry->bytes)
- kfree(entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
else
- link_free_space(block_group, entry);
+ link_free_space(ctl, entry);
}
out:
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
return ret;
}
@@ -1745,6 +1900,7 @@ int btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster)
{
+ struct btrfs_free_space_ctl *ctl;
int ret;
/* first, get a safe pointer to the block group */
@@ -1763,10 +1919,12 @@ int btrfs_return_cluster_to_free_space(
atomic_inc(&block_group->count);
spin_unlock(&cluster->lock);
+ ctl = block_group->free_space_ctl;
+
/* now return any extents the cluster had on it */
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
/* finally drop our ref */
btrfs_put_block_group(block_group);
@@ -1775,50 +1933,24 @@ int btrfs_return_cluster_to_free_space(
static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
+ struct btrfs_free_space *entry,
u64 bytes, u64 min_start)
{
- struct btrfs_free_space *entry;
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
int err;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
- spin_lock(&block_group->tree_lock);
- spin_lock(&cluster->lock);
-
- if (!cluster->points_to_bitmap)
- goto out;
-
- if (cluster->block_group != block_group)
- goto out;
-
- /*
- * search_start is the beginning of the bitmap, but at some point it may
- * be a good idea to point to the actual start of the free area in the
- * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
- * to 1 to make sure we get the bitmap entry
- */
- entry = tree_search_offset(block_group,
- offset_to_bitmap(block_group, search_start),
- 1, 0);
- if (!entry || !entry->bitmap)
- goto out;
-
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(block_group, entry, &search_start,
- &search_bytes);
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes);
if (err)
- goto out;
+ return 0;
ret = search_start;
- bitmap_clear_bits(block_group, entry, ret, bytes);
- if (entry->bytes == 0)
- free_bitmap(block_group, entry);
-out:
- spin_unlock(&cluster->lock);
- spin_unlock(&block_group->tree_lock);
+ bitmap_clear_bits(ctl, entry, ret, bytes);
return ret;
}
@@ -1832,14 +1964,11 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster, u64 bytes,
u64 min_start)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
u64 ret = 0;
- if (cluster->points_to_bitmap)
- return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
- min_start);
-
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -1852,11 +1981,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
goto out;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
-
while(1) {
- if (entry->bytes < bytes || entry->offset < min_start) {
- struct rb_node *node;
-
+ if (entry->bytes < bytes ||
+ (!entry->bitmap && entry->offset < min_start)) {
node = rb_next(&entry->offset_index);
if (!node)
break;
@@ -1864,10 +1991,26 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
offset_index);
continue;
}
- ret = entry->offset;
- entry->offset += bytes;
- entry->bytes -= bytes;
+ if (entry->bitmap) {
+ ret = btrfs_alloc_from_bitmap(block_group,
+ cluster, entry, bytes,
+ min_start);
+ if (ret == 0) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ break;
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ }
+ } else {
+
+ ret = entry->offset;
+
+ entry->offset += bytes;
+ entry->bytes -= bytes;
+ }
if (entry->bytes == 0)
rb_erase(&entry->offset_index, &cluster->root);
@@ -1879,15 +2022,20 @@ out:
if (!ret)
return 0;
- spin_lock(&block_group->tree_lock);
+ spin_lock(&ctl->tree_lock);
- block_group->free_space -= bytes;
+ ctl->free_space -= bytes;
if (entry->bytes == 0) {
- block_group->free_extents--;
- kfree(entry);
+ ctl->free_extents--;
+ if (entry->bitmap) {
+ kfree(entry->bitmap);
+ ctl->total_bitmaps--;
+ ctl->op->recalc_thresholds(ctl);
+ }
+ kmem_cache_free(btrfs_free_space_cachep, entry);
}
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
return ret;
}
@@ -1897,6 +2045,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster,
u64 offset, u64 bytes, u64 min_bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
unsigned long next_zero;
unsigned long i;
unsigned long search_bits;
@@ -1904,12 +2053,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long found_bits;
unsigned long start = 0;
unsigned long total_found = 0;
+ int ret;
bool found = false;
i = offset_to_bit(entry->offset, block_group->sectorsize,
max_t(u64, offset, entry->offset));
- search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
- total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ search_bits = bytes_to_bits(bytes, block_group->sectorsize);
+ total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
again:
found_bits = 0;
@@ -1926,7 +2076,7 @@ again:
}
if (!found_bits)
- return -1;
+ return -ENOSPC;
if (!found) {
start = i;
@@ -1950,192 +2100,212 @@ again:
cluster->window_start = start * block_group->sectorsize +
entry->offset;
- cluster->points_to_bitmap = true;
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
+ ret = tree_insert_offset(&cluster->root, entry->offset,
+ &entry->offset_index, 1);
+ BUG_ON(ret);
return 0;
}
/*
- * here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
- * We might not find them all in one contiguous area.
- *
- * returns zero and sets up cluster if things worked out, otherwise
- * it returns -enospc
+ * This searches the block group for just extents to fill the cluster with.
*/
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_free_cluster *cluster,
- u64 offset, u64 bytes, u64 empty_size)
+static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 min_bytes)
{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *first = NULL;
struct btrfs_free_space *entry = NULL;
+ struct btrfs_free_space *prev = NULL;
+ struct btrfs_free_space *last;
struct rb_node *node;
- struct btrfs_free_space *next;
- struct btrfs_free_space *last = NULL;
- u64 min_bytes;
u64 window_start;
u64 window_free;
- u64 max_extent = 0;
- bool found_bitmap = false;
- int ret;
-
- /* for metadata, allow allocates with more holes */
- if (btrfs_test_opt(root, SSD_SPREAD)) {
- min_bytes = bytes + empty_size;
- } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
- /*
- * we want to do larger allocations when we are
- * flushing out the delayed refs, it helps prevent
- * making more work as we go along.
- */
- if (trans->transaction->delayed_refs.flushing)
- min_bytes = max(bytes, (bytes + empty_size) >> 1);
- else
- min_bytes = max(bytes, (bytes + empty_size) >> 4);
- } else
- min_bytes = max(bytes, (bytes + empty_size) >> 2);
+ u64 max_extent;
+ u64 max_gap = 128 * 1024;
- spin_lock(&block_group->tree_lock);
- spin_lock(&cluster->lock);
-
- /* someone already found a cluster, hooray */
- if (cluster->block_group) {
- ret = 0;
- goto out;
- }
-again:
- entry = tree_search_offset(block_group, offset, found_bitmap, 1);
- if (!entry) {
- ret = -ENOSPC;
- goto out;
- }
+ entry = tree_search_offset(ctl, offset, 0, 1);
+ if (!entry)
+ return -ENOSPC;
/*
- * If found_bitmap is true, we exhausted our search for extent entries,
- * and we just want to search all of the bitmaps that we can find, and
- * ignore any extent entries we find.
+ * We don't want bitmaps, so just move along until we find a normal
+ * extent entry.
*/
- while (entry->bitmap || found_bitmap ||
- (!entry->bitmap && entry->bytes < min_bytes)) {
- struct rb_node *node = rb_next(&entry->offset_index);
-
- if (entry->bitmap && entry->bytes > bytes + empty_size) {
- ret = btrfs_bitmap_cluster(block_group, entry, cluster,
- offset, bytes + empty_size,
- min_bytes);
- if (!ret)
- goto got_it;
- }
-
- if (!node) {
- ret = -ENOSPC;
- goto out;
- }
+ while (entry->bitmap) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ return -ENOSPC;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
}
- /*
- * We already searched all the extent entries from the passed in offset
- * to the end and didn't find enough space for the cluster, and we also
- * didn't find any bitmaps that met our criteria, just go ahead and exit
- */
- if (found_bitmap) {
- ret = -ENOSPC;
- goto out;
- }
-
- cluster->points_to_bitmap = false;
window_start = entry->offset;
window_free = entry->bytes;
- last = entry;
max_extent = entry->bytes;
+ first = entry;
+ last = entry;
+ prev = entry;
- while (1) {
- /* out window is just right, lets fill it */
- if (window_free >= bytes + empty_size)
- break;
-
- node = rb_next(&last->offset_index);
- if (!node) {
- if (found_bitmap)
- goto again;
- ret = -ENOSPC;
- goto out;
- }
- next = rb_entry(node, struct btrfs_free_space, offset_index);
+ while (window_free <= min_bytes) {
+ node = rb_next(&entry->offset_index);
+ if (!node)
+ return -ENOSPC;
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
- /*
- * we found a bitmap, so if this search doesn't result in a
- * cluster, we know to go and search again for the bitmaps and
- * start looking for space there
- */
- if (next->bitmap) {
- if (!found_bitmap)
- offset = next->offset;
- found_bitmap = true;
- last = next;
+ if (entry->bitmap)
continue;
- }
-
/*
* we haven't filled the empty size and the window is
* very large. reset and try again
*/
- if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
- next->offset - window_start > (bytes + empty_size) * 2) {
- entry = next;
+ if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
+ entry->offset - window_start > (min_bytes * 2)) {
+ first = entry;
window_start = entry->offset;
window_free = entry->bytes;
last = entry;
max_extent = entry->bytes;
} else {
- last = next;
- window_free += next->bytes;
+ last = entry;
+ window_free += entry->bytes;
if (entry->bytes > max_extent)
max_extent = entry->bytes;
}
+ prev = entry;
}
- cluster->window_start = entry->offset;
+ cluster->window_start = first->offset;
+
+ node = &first->offset_index;
/*
* now we've found our entries, pull them out of the free space
* cache and put them into the cluster rbtree
- *
- * The cluster includes an rbtree, but only uses the offset index
- * of each free space cache entry.
*/
- while (1) {
+ do {
+ int ret;
+
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
- if (entry->bitmap && node) {
- entry = rb_entry(node, struct btrfs_free_space,
- offset_index);
+ if (entry->bitmap)
continue;
- } else if (entry->bitmap && !node) {
- break;
- }
- rb_erase(&entry->offset_index, &block_group->free_space_offset);
+ rb_erase(&entry->offset_index, &ctl->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
&entry->offset_index, 0);
BUG_ON(ret);
+ } while (node && entry != last);
- if (!node || entry == last)
- break;
+ cluster->max_size = max_extent;
+
+ return 0;
+}
+
+/*
+ * This specifically looks for bitmaps that may work in the cluster, we assume
+ * that we have already failed to find extents that will work.
+ */
+static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 min_bytes)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret = -ENOSPC;
+
+ if (ctl->total_bitmaps == 0)
+ return -ENOSPC;
+ entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
+ if (!entry)
+ return -ENOSPC;
+
+ node = &entry->offset_index;
+ do {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ node = rb_next(&entry->offset_index);
+ if (!entry->bitmap)
+ continue;
+ if (entry->bytes < min_bytes)
+ continue;
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
+ bytes, min_bytes);
+ } while (ret && node);
+
+ return ret;
+}
+
+/*
+ * here we try to find a cluster of blocks in a block group. The goal
+ * is to find at least bytes free and up to empty_size + bytes free.
+ * We might not find them all in one contiguous area.
+ *
+ * Returns zero and sets up the cluster if things worked out; otherwise
+ * it returns -ENOSPC.
+ */
+int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 empty_size)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ u64 min_bytes;
+ int ret;
+
+ /* for metadata, allow allocations with more holes */
+ if (btrfs_test_opt(root, SSD_SPREAD)) {
+ min_bytes = bytes + empty_size;
+ } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ /*
+ * we want to do larger allocations when we are
+ * flushing out the delayed refs, it helps prevent
+ * making more work as we go along.
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ min_bytes = max(bytes, (bytes + empty_size) >> 1);
+ else
+ min_bytes = max(bytes, (bytes + empty_size) >> 4);
+ } else
+ min_bytes = max(bytes, (bytes + empty_size) >> 2);
+
+ spin_lock(&ctl->tree_lock);
+
+ /*
+ * If we know we don't have enough space to make a cluster don't even
+ * bother doing all the work to try and find one.
+ */
+ if (ctl->free_space < min_bytes) {
+ spin_unlock(&ctl->tree_lock);
+ return -ENOSPC;
}
- cluster->max_size = max_extent;
-got_it:
- ret = 0;
- atomic_inc(&block_group->count);
- list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
- cluster->block_group = block_group;
+ spin_lock(&cluster->lock);
+
+ /* someone already found a cluster, hooray */
+ if (cluster->block_group) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
+ min_bytes);
+ if (ret)
+ ret = setup_cluster_bitmap(block_group, cluster, offset,
+ bytes, min_bytes);
+
+ if (!ret) {
+ atomic_inc(&block_group->count);
+ list_add_tail(&cluster->block_group_list,
+ &block_group->cluster_list);
+ cluster->block_group = block_group;
+ }
out:
spin_unlock(&cluster->lock);
- spin_unlock(&block_group->tree_lock);
+ spin_unlock(&ctl->tree_lock);
return ret;
}
@@ -2149,8 +2319,239 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
- cluster->points_to_bitmap = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+ struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+ struct btrfs_free_space *entry = NULL;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ u64 bytes = 0;
+ u64 actually_trimmed;
+ int ret = 0;
+
+ *trimmed = 0;
+
+ while (start < end) {
+ spin_lock(&ctl->tree_lock);
+
+ if (ctl->free_space < minlen) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ entry = tree_search_offset(ctl, start, 0, 1);
+ if (!entry)
+ entry = tree_search_offset(ctl,
+ offset_to_bitmap(ctl, start),
+ 1, 1);
+
+ if (!entry || entry->offset >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+
+ if (entry->bitmap) {
+ ret = search_bitmap(ctl, entry, &start, &bytes);
+ if (!ret) {
+ if (start >= end) {
+ spin_unlock(&ctl->tree_lock);
+ break;
+ }
+ bytes = min(bytes, end - start);
+ bitmap_clear_bits(ctl, entry, start, bytes);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+ } else {
+ start = entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize;
+ spin_unlock(&ctl->tree_lock);
+ ret = 0;
+ continue;
+ }
+ } else {
+ start = entry->offset;
+ bytes = min(entry->bytes, end - start);
+ unlink_free_space(ctl, entry);
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ }
+
+ spin_unlock(&ctl->tree_lock);
+
+ if (bytes >= minlen) {
+ int update_ret;
+ update_ret = btrfs_update_reserved_bytes(block_group,
+ bytes, 1, 1);
+
+ ret = btrfs_error_discard_extent(fs_info->extent_root,
+ start,
+ bytes,
+ &actually_trimmed);
+
+ btrfs_add_free_space(block_group, start, bytes);
+ if (!update_ret)
+ btrfs_update_reserved_bytes(block_group,
+ bytes, 0, 1);
+
+ if (ret)
+ break;
+ *trimmed += actually_trimmed;
+ }
+ start += bytes;
+ bytes = 0;
+
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * Find the left-most item in the cache tree, and then return the
+ * smallest inode number in the item.
+ *
+ * Note: the returned inode number may not be the smallest one in
+ * the tree, if the left-most item is a bitmap.
+ */
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
+{
+ struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
+ struct btrfs_free_space *entry = NULL;
+ u64 ino = 0;
+
+ spin_lock(&ctl->tree_lock);
+
+ if (RB_EMPTY_ROOT(&ctl->free_space_offset))
+ goto out;
+
+ entry = rb_entry(rb_first(&ctl->free_space_offset),
+ struct btrfs_free_space, offset_index);
+
+ if (!entry->bitmap) {
+ ino = entry->offset;
+
+ unlink_free_space(ctl, entry);
+ entry->offset++;
+ entry->bytes--;
+ if (!entry->bytes)
+ kmem_cache_free(btrfs_free_space_cachep, entry);
+ else
+ link_free_space(ctl, entry);
+ } else {
+ u64 offset = 0;
+ u64 count = 1;
+ int ret;
+
+ ret = search_bitmap(ctl, entry, &offset, &count);
+ BUG_ON(ret);
+
+ ino = offset;
+ bitmap_clear_bits(ctl, entry, offset, 1);
+ if (entry->bytes == 0)
+ free_bitmap(ctl, entry);
+ }
+out:
+ spin_unlock(&ctl->tree_lock);
+
+ return ino;
+}
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&root->cache_lock);
+ if (root->cache_inode)
+ inode = igrab(root->cache_inode);
+ spin_unlock(&root->cache_lock);
+ if (inode)
+ return inode;
+
+ inode = __lookup_free_space_inode(root, path, 0);
+ if (IS_ERR(inode))
+ return inode;
+
+ spin_lock(&root->cache_lock);
+ if (!root->fs_info->closing)
+ root->cache_inode = igrab(inode);
+ spin_unlock(&root->cache_lock);
+
+ return inode;
+}
+
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ return __create_free_space_inode(root, trans, path,
+ BTRFS_FREE_INO_OBJECTID, 0);
+}
+
+int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ int ret = 0;
+ u64 root_gen = btrfs_root_generation(&root->root_item);
+
+ /*
+ * If we're unmounting then just return, since this does a search on the
+ * normal root and not the commit root and we could deadlock.
+ */
+ smp_mb();
+ if (fs_info->closing)
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return 0;
+
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode))
+ goto out;
+
+ if (root_gen != BTRFS_I(inode)->generation)
+ goto out_put;
+
+ ret = __load_free_space_cache(root, inode, ctl, path, 0);
+
+ if (ret < 0)
+ printk(KERN_ERR "btrfs: failed to load free ino cache for "
+ "root %llu\n", root->root_key.objectid);
+out_put:
+ iput(inode);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct inode *inode;
+ int ret;
+
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode))
+ return 0;
+
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+ if (ret < 0)
+ printk(KERN_ERR "btrfs: failed to write free ino cache "
+ "for root %llu\n", root->root_key.objectid);
+
+ iput(inode);
+ return ret;
+}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b5..8f2613f779ed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,25 @@ struct btrfs_free_space {
struct list_head list;
};
+struct btrfs_free_space_ctl {
+ spinlock_t tree_lock;
+ struct rb_root free_space_offset;
+ u64 free_space;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
+ int unit;
+ u64 start;
+ struct btrfs_free_space_op *op;
+ void *private;
+};
+
+struct btrfs_free_space_op {
+ void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
+ bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info);
+};
+
struct inode *lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_block_group_cache
*block_group, struct btrfs_path *path);
@@ -45,17 +64,38 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_path *path);
-int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
- u64 bytenr, u64 size);
+
+struct inode *lookup_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_path *path);
+int create_free_ino_inode(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path);
+int load_free_ino_cache(struct btrfs_fs_info *fs_info,
+ struct btrfs_root *root);
+int btrfs_write_out_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path);
+
+void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
+int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
+ u64 bytenr, u64 size);
+static inline int
+btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
+ u64 bytenr, u64 size)
+{
+ return __btrfs_add_free_space(block_group->free_space_ctl,
+ bytenr, size);
+}
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
+void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
- *block_group);
+ *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes, u64 empty_size);
+u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
u64 bytes);
-u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_block_group_cache *block_group,
@@ -68,4 +108,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
int btrfs_return_cluster_to_free_space(
struct btrfs_block_group_cache *block_group,
struct btrfs_free_cluster *cluster);
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+ u64 *trimmed, u64 start, u64 end, u64 minlen);
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 64f1150bb48d..baa74f3db691 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -130,7 +130,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
item_size - (ptr + sub_item_len - item_start));
ret = btrfs_truncate_item(trans, root, path,
item_size - sub_item_len, 1);
- BUG_ON(ret);
out:
btrfs_free_path(path);
return ret;
@@ -167,7 +166,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
ret = btrfs_extend_item(trans, root, path, ins_len);
- BUG_ON(ret);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..3262cd17a12f 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,11 +16,446 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+
#include "ctree.h"
#include "disk-io.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
#include "transaction.h"
-int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
+static int caching_kthread(void *data)
+{
+ struct btrfs_root *root = data;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ u64 last = (u64)-1;
+ int slot;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Since the commit root is read-only, we can safely skip locking. */
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ path->reada = 2;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.offset = 0;
+ key.type = BTRFS_INODE_ITEM_KEY;
+again:
+ /* need to make sure the commit_root doesn't disappear */
+ mutex_lock(&root->fs_commit_mutex);
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ while (1) {
+ smp_mb();
+ if (fs_info->closing)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+
+ if (need_resched() ||
+ btrfs_transaction_in_commit(fs_info)) {
+ leaf = path->nodes[0];
+
+ if (btrfs_header_nritems(leaf) == 0) {
+ WARN_ON(1);
+ break;
+ }
+
+ /*
+ * Save the key so we can advance forward
+ * in the next search.
+ */
+ btrfs_item_key_to_cpu(leaf, &key, 0);
+ btrfs_release_path(path);
+ root->cache_progress = last;
+ mutex_unlock(&root->fs_commit_mutex);
+ schedule_timeout(1);
+ goto again;
+ } else
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ if (key.type != BTRFS_INODE_ITEM_KEY)
+ goto next;
+
+ if (key.objectid >= root->highest_objectid)
+ break;
+
+ if (last != (u64)-1 && last + 1 != key.objectid) {
+ __btrfs_add_free_space(ctl, last + 1,
+ key.objectid - last - 1);
+ wake_up(&root->cache_wait);
+ }
+
+ last = key.objectid;
+next:
+ path->slots[0]++;
+ }
+
+ if (last < root->highest_objectid - 1) {
+ __btrfs_add_free_space(ctl, last + 1,
+ root->highest_objectid - last - 1);
+ }
+
+ spin_lock(&root->cache_lock);
+ root->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->cache_lock);
+
+ root->cache_progress = (u64)-1;
+ btrfs_unpin_free_ino(root);
+out:
+ wake_up(&root->cache_wait);
+ mutex_unlock(&root->fs_commit_mutex);
+
+ btrfs_free_path(path);
+
+ return ret;
+}
+
+static void start_caching(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct task_struct *tsk;
+ int ret;
+ u64 objectid;
+
+ spin_lock(&root->cache_lock);
+ if (root->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&root->cache_lock);
+ return;
+ }
+
+ root->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&root->cache_lock);
+
+ ret = load_free_ino_cache(root->fs_info, root);
+ if (ret == 1) {
+ spin_lock(&root->cache_lock);
+ root->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&root->cache_lock);
+ return;
+ }
+
+ /*
+ * It can be quite time-consuming to fill the cache by searching
+ * through the tree for inode items, and this can keep the ino
+ * allocation path waiting. Therefore at start we quickly find the
+ * highest inode number, so we know we can use inode numbers in
+ * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID].
+ */
+ ret = btrfs_find_free_objectid(root, &objectid);
+ if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
+ __btrfs_add_free_space(ctl, objectid,
+ BTRFS_LAST_FREE_OBJECTID - objectid + 1);
+ }
+
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
+ root->root_key.objectid);
+ BUG_ON(IS_ERR(tsk));
+}
+
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
+{
+again:
+ *objectid = btrfs_find_ino_for_alloc(root);
+
+ if (*objectid != 0)
+ return 0;
+
+ start_caching(root);
+
+ wait_event(root->cache_wait,
+ root->cached == BTRFS_CACHE_FINISHED ||
+ root->free_ino_ctl->free_space > 0);
+
+ if (root->cached == BTRFS_CACHE_FINISHED &&
+ root->free_ino_ctl->free_space == 0)
+ return -ENOSPC;
+ else
+ goto again;
+}
+
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+again:
+ if (root->cached == BTRFS_CACHE_FINISHED) {
+ __btrfs_add_free_space(ctl, objectid, 1);
+ } else {
+ /*
+ * If we are in the process of caching free ino chunks,
+ * to avoid adding the same inode number to the free_ino
+ * tree twice across transactions, we'll leave it
+ * in the pinned tree until a transaction is committed
+ * or the caching work is done.
+ */
+
+ mutex_lock(&root->fs_commit_mutex);
+ spin_lock(&root->cache_lock);
+ if (root->cached == BTRFS_CACHE_FINISHED) {
+ spin_unlock(&root->cache_lock);
+ mutex_unlock(&root->fs_commit_mutex);
+ goto again;
+ }
+ spin_unlock(&root->cache_lock);
+
+ start_caching(root);
+
+ if (objectid <= root->cache_progress ||
+ objectid > root->highest_objectid)
+ __btrfs_add_free_space(ctl, objectid, 1);
+ else
+ __btrfs_add_free_space(pinned, objectid, 1);
+
+ mutex_unlock(&root->fs_commit_mutex);
+ }
+}
+
+/*
+ * When a transaction is committed, we'll move those inode numbers which
+ * are not larger than root->cache_progress from the pinned tree to the
+ * free_ino tree; the others are just dropped, because the commit root we
+ * were searching has changed.
+ *
+ * Must be called with root->fs_commit_mutex held
+ */
+void btrfs_unpin_free_ino(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ u64 count;
+
+ while (1) {
+ n = rb_first(rbroot);
+ if (!n)
+ break;
+
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+ BUG_ON(info->bitmap);
+
+ if (info->offset > root->cache_progress)
+ goto free;
+ else if (info->offset + info->bytes > root->cache_progress)
+ count = root->cache_progress - info->offset + 1;
+ else
+ count = info->bytes;
+
+ __btrfs_add_free_space(ctl, info->offset, count);
+free:
+ rb_erase(&info->offset_index, rbroot);
+ kfree(info);
+ }
+}
+
+#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+
+/*
+ * The goal is to keep the memory used by the free_ino tree from
+ * exceeding the memory we would use with bitmaps only.
+ */
+static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+ struct btrfs_free_space *info;
+ struct rb_node *n;
+ int max_ino;
+ int max_bitmaps;
+
+ n = rb_last(&ctl->free_space_offset);
+ if (!n) {
+ ctl->extents_thresh = INIT_THRESHOLD;
+ return;
+ }
+ info = rb_entry(n, struct btrfs_free_space, offset_index);
+
+ /*
+ * Find the maximum inode number in the filesystem. Note we
+ * ignore the fact that this can be a bitmap, because we are
+ * not doing a precise calculation.
+ */
+ max_ino = info->bytes - 1;
+
+ max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
+ if (max_bitmaps <= ctl->total_bitmaps) {
+ ctl->extents_thresh = 0;
+ return;
+ }
+
+ ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
+ PAGE_CACHE_SIZE / sizeof(*info);
+}
+
+/*
+ * We don't fall back to a bitmap if we are below the extents threshold
+ * or this chunk of inode numbers is a big one.
+ */
+static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ if (ctl->free_extents < ctl->extents_thresh ||
+ info->bytes > INODES_PER_BITMAP / 10)
+ return false;
+
+ return true;
+}
+
+static struct btrfs_free_space_op free_ino_op = {
+ .recalc_thresholds = recalculate_thresholds,
+ .use_bitmap = use_bitmap,
+};
+
+static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
+{
+}
+
+static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
+ struct btrfs_free_space *info)
+{
+ /*
+ * We always use extents for two reasons:
+ *
+ * - The pinned tree is only used during the process of caching
+ * work.
+ * - Make code simpler. See btrfs_unpin_free_ino().
+ */
+ return false;
+}
+
+static struct btrfs_free_space_op pinned_free_ino_op = {
+ .recalc_thresholds = pinned_recalc_thresholds,
+ .use_bitmap = pinned_use_bitmap,
+};
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+
+ spin_lock_init(&ctl->tree_lock);
+ ctl->unit = 1;
+ ctl->start = 0;
+ ctl->private = NULL;
+ ctl->op = &free_ino_op;
+
+ /*
+ * Initially we allow 16K of RAM to be used to cache chunks of
+ * inode numbers before we resort to bitmaps. This is somewhat
+ * arbitrary, but it will be adjusted at runtime.
+ */
+ ctl->extents_thresh = INIT_THRESHOLD;
+
+ spin_lock_init(&pinned->tree_lock);
+ pinned->unit = 1;
+ pinned->start = 0;
+ pinned->private = NULL;
+ pinned->extents_thresh = 0;
+ pinned->op = &pinned_free_ino_op;
+}
+
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
+ struct btrfs_path *path;
+ struct inode *inode;
+ u64 alloc_hint = 0;
+ int ret;
+ int prealloc;
+ bool retry = false;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+again:
+ inode = lookup_free_ino_inode(root, path);
+ if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+ ret = PTR_ERR(inode);
+ goto out;
+ }
+
+ if (IS_ERR(inode)) {
+ BUG_ON(retry);
+ retry = true;
+
+ ret = create_free_ino_inode(root, trans, path);
+ if (ret)
+ goto out;
+ goto again;
+ }
+
+ BTRFS_I(inode)->generation = 0;
+ ret = btrfs_update_inode(trans, root, inode);
+ WARN_ON(ret);
+
+ if (i_size_read(inode) > 0) {
+ ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+ if (ret)
+ goto out_put;
+ }
+
+ spin_lock(&root->cache_lock);
+ if (root->cached != BTRFS_CACHE_FINISHED) {
+ ret = -1;
+ spin_unlock(&root->cache_lock);
+ goto out_put;
+ }
+ spin_unlock(&root->cache_lock);
+
+ spin_lock(&ctl->tree_lock);
+ prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
+ prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
+ prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+ spin_unlock(&ctl->tree_lock);
+
+ /* Just to make sure we have enough space */
+ prealloc += 8 * PAGE_CACHE_SIZE;
+
+ ret = btrfs_check_data_free_space(inode, prealloc);
+ if (ret)
+ goto out_put;
+
+ ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
+ prealloc, prealloc, &alloc_hint);
+ if (ret)
+ goto out_put;
+ btrfs_free_reserved_data_space(inode, prealloc);
+
+out_put:
+ iput(inode);
+out:
+ if (ret == 0)
+ ret = btrfs_write_out_ino_cache(root, trans, path);
+
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
{
struct btrfs_path *path;
int ret;
@@ -30,7 +465,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
int slot;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
search_key.type = -1;
@@ -54,15 +490,14 @@ error:
return ret;
}
-int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 dirid, u64 *objectid)
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
{
int ret;
mutex_lock(&root->objectid_mutex);
if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
- ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ ret = btrfs_find_highest_objectid(root,
+ &root->highest_objectid);
if (ret)
goto out;
}
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 000000000000..ddb347bfee23
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
+#ifndef __BTRFS_INODE_MAP
+#define __BTRFS_INODE_MAP
+
+void btrfs_init_free_ino_ctl(struct btrfs_root *root);
+void btrfs_unpin_free_ino(struct btrfs_root *root);
+void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
+int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
+int btrfs_save_ino_cache(struct btrfs_root *root,
+ struct btrfs_trans_handle *trans);
+
+int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
+
+#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bcc461a9695f..39a9d5750efd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -50,6 +51,8 @@
#include "tree-log.h"
#include "compression.h"
#include "locking.h"
+#include "free-space-cache.h"
+#include "inode-map.h"
struct btrfs_iget_args {
u64 ino;
@@ -70,6 +73,7 @@ static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
+struct kmem_cache *btrfs_free_space_cachep;
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +86,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
};
-static void btrfs_truncate(struct inode *inode);
+static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
@@ -90,13 +95,14 @@ static noinline int cow_file_range(struct inode *inode,
unsigned long *nr_written, int unlock);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
{
int err;
err = btrfs_init_acl(trans, inode, dir);
if (!err)
- err = btrfs_xattr_security_init(trans, inode, dir);
+ err = btrfs_xattr_security_init(trans, inode, dir, qstr);
return err;
}
@@ -108,6 +114,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
+ int compress_type,
struct page **compressed_pages)
{
struct btrfs_key key;
@@ -122,12 +129,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t cur_size = size;
size_t datasize;
unsigned long offset;
- int compress_type = BTRFS_COMPRESS_NONE;
- if (compressed_size && compressed_pages) {
- compress_type = root->fs_info->compress_type;
+ if (compressed_size && compressed_pages)
cur_size = compressed_size;
- }
path = btrfs_alloc_path();
if (!path)
@@ -136,7 +140,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
btrfs_set_trans_block_group(trans, inode);
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
key.offset = start;
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -217,7 +221,7 @@ fail:
static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end,
- size_t compressed_size,
+ size_t compressed_size, int compress_type,
struct page **compressed_pages)
{
u64 isize = i_size_read(inode);
@@ -250,7 +254,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len = min_t(u64, isize, actual_end);
ret = insert_inline_extent(trans, root, inode, start,
inline_len, compressed_size,
- compressed_pages);
+ compress_type, compressed_pages);
BUG_ON(ret);
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -287,6 +291,7 @@ static noinline int add_async_extent(struct async_cow *cow,
struct async_extent *async_extent;
async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
+ BUG_ON(!async_extent);
async_extent->start = start;
async_extent->ram_size = ram_size;
async_extent->compressed_size = compressed_size;
@@ -337,6 +342,10 @@ static noinline int compress_file_range(struct inode *inode,
int will_compress;
int compress_type = root->fs_info->compress_type;
+ /* if this is a small write inside eof, kick off a defragbot */
+ if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+ btrfs_add_inode_defrag(NULL, inode);
+
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -381,9 +390,11 @@ again:
*/
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
(btrfs_test_opt(root, COMPRESS) ||
- (BTRFS_I(inode)->force_compress))) {
+ (BTRFS_I(inode)->force_compress) ||
+ (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
WARN_ON(pages);
pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
+ BUG_ON(!pages);
if (BTRFS_I(inode)->force_compress)
compress_type = BTRFS_I(inode)->force_compress;
@@ -426,12 +437,13 @@ again:
* to make an uncompressed inline extent.
*/
ret = cow_file_range_inline(trans, root, inode,
- start, end, 0, NULL);
+ start, end, 0, 0, NULL);
} else {
/* try making a compressed inline extent */
ret = cow_file_range_inline(trans, root, inode,
start, end,
- total_compressed, pages);
+ total_compressed,
+ compress_type, pages);
}
if (ret == 0) {
/*
@@ -643,7 +655,8 @@ retry:
async_extent->start +
async_extent->ram_size - 1, 0);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
+ BUG_ON(!em);
em->start = async_extent->start;
em->len = async_extent->ram_size;
em->orig_start = em->start;
@@ -738,6 +751,15 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
return alloc_hint;
}
+static inline bool is_free_space_inode(struct btrfs_root *root,
+ struct inode *inode)
+{
+ if (root == root->fs_info->tree_root ||
+ BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+ return true;
+ return false;
+}
+
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
@@ -770,7 +792,7 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
- BUG_ON(root == root->fs_info->tree_root);
+ BUG_ON(is_free_space_inode(root, inode));
trans = btrfs_join_transaction(root, 1);
BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
@@ -781,10 +803,14 @@ static noinline int cow_file_range(struct inode *inode,
disk_num_bytes = num_bytes;
ret = 0;
+ /* if this is a small write inside eof, kick off defrag */
+ if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
+ btrfs_add_inode_defrag(trans, inode);
+
if (start == 0) {
/* lets try to make an inline extent */
ret = cow_file_range_inline(trans, root, inode,
- start, end, 0, NULL);
+ start, end, 0, 0, NULL);
if (ret == 0) {
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
@@ -819,7 +845,8 @@ static noinline int cow_file_range(struct inode *inode,
(u64)-1, &ins, 1);
BUG_ON(ret);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
+ BUG_ON(!em);
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
@@ -946,6 +973,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
+ BUG_ON(!async_cow);
async_cow->inode = inode;
async_cow->root = root;
async_cow->locked_page = locked_page;
@@ -999,7 +1027,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
LIST_HEAD(list);
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
- bytenr + num_bytes - 1, &list);
+ bytenr + num_bytes - 1, &list, 0);
if (ret == 0 && list_empty(&list))
return 0;
@@ -1040,29 +1068,31 @@ static noinline int run_delalloc_nocow(struct inode *inode,
int type;
int nocow;
int check_prev = 1;
- bool nolock = false;
+ bool nolock;
+ u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
BUG_ON(!path);
- if (root == root->fs_info->tree_root) {
- nolock = true;
+
+ nolock = is_free_space_inode(root, inode);
+
+ if (nolock)
trans = btrfs_join_transaction_nolock(root, 1);
- } else {
+ else
trans = btrfs_join_transaction(root, 1);
- }
BUG_ON(IS_ERR(trans));
cow_start = (u64)-1;
cur_offset = start;
while (1) {
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, ino,
cur_offset, 0);
BUG_ON(ret < 0);
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
path->slots[0] - 1);
- if (found_key.objectid == inode->i_ino &&
+ if (found_key.objectid == ino &&
found_key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
@@ -1083,7 +1113,7 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > inode->i_ino ||
+ if (found_key.objectid > ino ||
found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1118,7 +1148,7 @@ next_slot:
goto out_check;
if (btrfs_extent_readonly(root, disk_bytenr))
goto out_check;
- if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ if (btrfs_cross_ref_exist(trans, root, ino,
found_key.offset -
extent_offset, disk_bytenr))
goto out_check;
@@ -1155,7 +1185,7 @@ out_check:
goto next_slot;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start,
found_key.offset - 1, page_started,
@@ -1168,7 +1198,8 @@ out_check:
struct extent_map *em;
struct extent_map_tree *em_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
+ BUG_ON(!em);
em->start = cur_offset;
em->orig_start = em->start;
em->len = num_bytes;
@@ -1212,7 +1243,7 @@ out_check:
if (cur_offset > end)
break;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (cur_offset <= end && cow_start == (u64)-1)
cow_start = cur_offset;
@@ -1250,7 +1281,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
else if (!btrfs_test_opt(root, COMPRESS) &&
- !(BTRFS_I(inode)->force_compress))
+ !(BTRFS_I(inode)->force_compress) &&
+ !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
ret = cow_file_range(inode, locked_page, start, end,
page_started, nr_written, 1);
else
@@ -1299,14 +1331,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
/*
* set_bit and clear bit hooks normally require _irqsave/restore
- * but in this case, we are only testeing for the DELALLOC
+ * but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- int do_list = (root->root_key.objectid !=
- BTRFS_ROOT_TREE_OBJECTID);
+ bool do_list = !is_free_space_inode(root, inode);
if (*bits & EXTENT_FIRST_DELALLOC)
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1333,14 +1364,13 @@ static int btrfs_clear_bit_hook(struct inode *inode,
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
- * but in this case, we are only testeing for the DELALLOC
+ * but in this case, we are only testing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 len = state->end + 1 - state->start;
- int do_list = (root->root_key.objectid !=
- BTRFS_ROOT_TREE_OBJECTID);
+ bool do_list = !is_free_space_inode(root, inode);
if (*bits & EXTENT_FIRST_DELALLOC)
*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1447,7 +1477,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
- if (root == root->fs_info->tree_root)
+ if (is_free_space_inode(root, inode))
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
else
ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1457,8 +1487,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
if (bio_flags & EXTENT_BIO_COMPRESSED) {
return btrfs_submit_compressed_read(inode, bio,
mirror_num, bio_flags);
- } else if (!skip_sum)
- btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ } else if (!skip_sum) {
+ ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
goto mapit;
} else if (!skip_sum) {
/* csum items have already been cloned */
@@ -1630,7 +1663,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
&hint, 0);
BUG_ON(ret);
- ins.objectid = inode->i_ino;
+ ins.objectid = btrfs_ino(inode);
ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
@@ -1661,7 +1694,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- inode->i_ino, file_pos, &ins);
+ btrfs_ino(inode), file_pos, &ins);
BUG_ON(ret);
btrfs_free_path(path);
@@ -1687,7 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
struct extent_state *cached_state = NULL;
int compress_type = 0;
int ret;
- bool nolock = false;
+ bool nolock;
ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
end - start + 1);
@@ -1695,7 +1728,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
return 0;
BUG_ON(!ordered_extent);
- nolock = (root == root->fs_info->tree_root);
+ nolock = is_free_space_inode(root, inode);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list));
@@ -1757,9 +1790,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- btrfs_ordered_update_i_size(inode, 0, ordered_extent);
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+ if (!ret) {
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ }
+ ret = 0;
out:
if (nolock) {
if (trans)
@@ -1781,6 +1817,8 @@ out:
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
+
ClearPagePrivate2(page);
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}
@@ -1836,7 +1874,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
}
read_unlock(&em_tree->lock);
- if (!em || IS_ERR(em)) {
+ if (IS_ERR_OR_NULL(em)) {
kfree(failrec);
return -EIO;
}
@@ -1891,10 +1929,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
else
rw = READ;
- BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
+ ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
failrec->bio_flags, 0);
- return 0;
+ return ret;
}
/*
@@ -1910,7 +1948,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
private = 0;
if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
- (u64)-1, 1, EXTENT_DIRTY)) {
+ (u64)-1, 1, EXTENT_DIRTY, 0)) {
ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
start, &private_failure);
if (ret == 0) {
@@ -1985,12 +2023,11 @@ good:
return 0;
zeroit:
- if (printk_ratelimit()) {
- printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u "
- "private %llu\n", page->mapping->host->i_ino,
+ printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
+ "private %llu\n",
+ (unsigned long long)btrfs_ino(page->mapping->host),
(unsigned long long)start, csum,
(unsigned long long)private);
- }
memset(kaddr + offset, 1, end - start + 1);
flush_dcache_page(page);
kunmap_atomic(kaddr, KM_USER0);
@@ -2206,8 +2243,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
insert = 1;
#endif
insert = 1;
- } else {
- WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
}
if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2227,7 +2262,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
/* insert an orphan item to track this unlinked/truncated file */
if (insert >= 1) {
- ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
BUG_ON(ret);
}
@@ -2264,7 +2299,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
spin_unlock(&root->orphan_lock);
if (trans && delete_item) {
- ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
BUG_ON(ret);
}
@@ -2278,7 +2313,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
* this cleans up any orphans that may be left on the list from the last use
* of this root.
*/
-void btrfs_orphan_cleanup(struct btrfs_root *root)
+int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -2288,10 +2323,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
int ret = 0, nr_unlink = 0, nr_truncate = 0;
if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
- return;
+ return 0;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
path->reada = -1;
key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2300,18 +2338,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- printk(KERN_ERR "Error searching slot for orphan: %d"
- "\n", ret);
- break;
- }
+ if (ret < 0)
+ goto out;
/*
* if ret == 0 means we found what we were searching for, which
- * is weird, but possible, so only screw with path if we didnt
+ * is weird, but possible, so only screw with path if we didn't
* find the key and see if we have stuff that matches
*/
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -2328,7 +2364,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
break;
/* release the path since we're done with it */
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/*
* this is where we are basically btrfs_lookup, without the
@@ -2339,7 +2375,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- BUG_ON(IS_ERR(inode));
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ goto out;
+ }
/*
* add this inode to the orphan list so btrfs_orphan_del does
@@ -2357,7 +2396,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
*/
if (is_bad_inode(inode)) {
trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
btrfs_orphan_del(trans, inode);
btrfs_end_transaction(trans, root);
iput(inode);
@@ -2366,17 +2408,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
+ if (!S_ISREG(inode->i_mode)) {
+ WARN_ON(1);
+ iput(inode);
+ continue;
+ }
nr_truncate++;
- btrfs_truncate(inode);
+ ret = btrfs_truncate(inode);
} else {
nr_unlink++;
}
/* this will do delete_inode and everything for us */
iput(inode);
+ if (ret)
+ goto out;
}
- btrfs_free_path(path);
-
root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
if (root->orphan_block_rsv)
@@ -2385,14 +2432,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
if (root->orphan_block_rsv || root->orphan_item_inserted) {
trans = btrfs_join_transaction(root, 1);
- BUG_ON(IS_ERR(trans));
- btrfs_end_transaction(trans, root);
+ if (!IS_ERR(trans))
+ btrfs_end_transaction(trans, root);
}
if (nr_unlink)
printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
if (nr_truncate)
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+
+out:
+ if (ret)
+ printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
}
/*
@@ -2508,7 +2561,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
* try to precache a NULL acl entry for files that don't have
* any xattrs or acls
*/
- maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino);
+ maybe_acls = acls_after_inode_item(leaf, path->slots[0],
+ btrfs_ino(inode));
if (!maybe_acls)
cache_no_acl(inode);
@@ -2559,6 +2613,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
+ if (!leaf->map_token)
+ map_private_extent_buffer(leaf, (unsigned long)item,
+ sizeof(struct btrfs_inode_item),
+ &leaf->map_token, &leaf->kaddr,
+ &leaf->map_start, &leaf->map_len,
+ KM_USER1);
+
btrfs_set_inode_uid(leaf, item, inode->i_uid);
btrfs_set_inode_gid(leaf, item, inode->i_gid);
btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2587,6 +2648,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+
+ if (leaf->map_token) {
+ unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+ leaf->map_token = NULL;
+ }
}
/*
@@ -2600,11 +2666,26 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int ret;
+ /*
+ * If root is tree root, it means this inode is used to
+ * store free space information. And these inodes are updated
+ * when committing the transaction, so their updates must not be
+ * delayed, or a deadlock will occur.
+ */
+ if (!is_free_space_inode(root, inode)) {
+ ret = btrfs_delayed_update_inode(trans, root, inode);
+ if (!ret)
+ btrfs_set_inode_last_trans(trans, inode);
+ return ret;
+ }
+
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
path->leave_spinning = 1;
- ret = btrfs_lookup_inode(trans, root, path,
- &BTRFS_I(inode)->location, 1);
+ ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
+ 1);
if (ret) {
if (ret > 0)
ret = -ENOENT;
@@ -2614,7 +2695,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_inode_item);
+ struct btrfs_inode_item);
fill_inode_item(trans, leaf, inode_item, inode);
btrfs_mark_buffer_dirty(leaf);
@@ -2625,16 +2706,15 @@ failed:
return ret;
}
-
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
* also drops the back refs in the inode to the directory
*/
-int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct inode *dir, struct inode *inode,
- const char *name, int name_len)
+static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
{
struct btrfs_path *path;
int ret = 0;
@@ -2642,6 +2722,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di;
struct btrfs_key key;
u64 index;
+ u64 ino = btrfs_ino(inode);
+ u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
if (!path) {
@@ -2650,7 +2732,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
}
path->leave_spinning = 1;
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -2665,33 +2747,23 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret)
goto err;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- ret = btrfs_del_inode_ref(trans, root, name, name_len,
- inode->i_ino,
- dir->i_ino, &index);
+ ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
+ dir_ino, &index);
if (ret) {
printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
- "inode %lu parent %lu\n", name_len, name,
- inode->i_ino, dir->i_ino);
+ "inode %llu parent %llu\n", name_len, name,
+ (unsigned long long)ino, (unsigned long long)dir_ino);
goto err;
}
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto err;
- }
- if (!di) {
- ret = -ENOENT;
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+ if (ret)
goto err;
- }
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- btrfs_release_path(root, path);
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
- inode, dir->i_ino);
+ inode, dir_ino);
BUG_ON(ret != 0 && ret != -ENOENT);
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
@@ -2706,12 +2778,25 @@ err:
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
btrfs_update_inode(trans, root, dir);
- btrfs_drop_nlink(inode);
- ret = btrfs_update_inode(trans, root, inode);
out:
return ret;
}
+int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, struct inode *inode,
+ const char *name, int name_len)
+{
+ int ret;
+ ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
+ if (!ret) {
+ btrfs_drop_nlink(inode);
+ ret = btrfs_update_inode(trans, root, inode);
+ }
+ return ret;
+}
+
+
/* helper to check if there is any shared block in the path */
static int check_path_shared(struct btrfs_root *root,
struct btrfs_path *path)
@@ -2756,12 +2841,14 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
int check_link = 1;
int err = -ENOSPC;
int ret;
+ u64 ino = btrfs_ino(inode);
+ u64 dir_ino = btrfs_ino(dir);
trans = btrfs_start_transaction(root, 10);
if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
return trans;
- if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return ERR_PTR(-ENOSPC);
/* check if there is someone else holds reference */
@@ -2802,7 +2889,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
} else {
check_link = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_lookup_inode(trans, root, path,
&BTRFS_I(inode)->location, 0);
@@ -2816,11 +2903,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
} else {
check_link = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret == 0 && S_ISREG(inode->i_mode)) {
ret = btrfs_lookup_file_extent(trans, root, path,
- inode->i_ino, (u64)-1, 0);
+ ino, (u64)-1, 0);
if (ret < 0) {
err = ret;
goto out;
@@ -2828,7 +2915,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
BUG_ON(ret == 0);
if (check_path_shared(root, path))
goto out;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
if (!check_link) {
@@ -2836,7 +2923,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
goto out;
}
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
dentry->d_name.name, dentry->d_name.len, 0);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2849,11 +2936,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
err = 0;
goto out;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ref = btrfs_lookup_inode_ref(trans, root, path,
dentry->d_name.name, dentry->d_name.len,
- inode->i_ino, dir->i_ino, 0);
+ ino, dir_ino, 0);
if (IS_ERR(ref)) {
err = PTR_ERR(ref);
goto out;
@@ -2862,9 +2949,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
if (check_path_shared(root, path))
goto out;
index = btrfs_inode_ref_index(path->nodes[0], ref);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+ /*
+ * This is a commit root search. If we can look up the inode item and
+ * other related items in the commit root, it means the transaction of
+ * dir/file creation has been committed, and the dir index item that we
+ * delay to insert has also been inserted into the commit root. So
+ * we needn't worry about the delayed insertion of the dir index item
+ * here.
+ */
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
dentry->d_name.name, dentry->d_name.len, 0);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2939,54 +3034,47 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_key key;
u64 index;
int ret;
+ u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- BUG_ON(!di || IS_ERR(di));
+ BUG_ON(IS_ERR_OR_NULL(di));
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
BUG_ON(ret);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
- dir->i_ino, &index, name, name_len);
+ dir_ino, &index, name, name_len);
if (ret < 0) {
BUG_ON(ret != -ENOENT);
- di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
- BUG_ON(!di || IS_ERR(di));
+ BUG_ON(IS_ERR_OR_NULL(di));
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
index = key.offset;
}
+ btrfs_release_path(path);
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- index, name, name_len, -1);
- BUG_ON(!di || IS_ERR(di));
-
- leaf = path->nodes[0];
- btrfs_dir_item_key_to_cpu(leaf, di, &key);
- WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
BUG_ON(ret);
- btrfs_release_path(root, path);
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
- btrfs_free_path(path);
return 0;
}
@@ -2999,7 +3087,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
unsigned long nr = 0;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
trans = __unlink_start_trans(dir, dentry);
@@ -3008,7 +3096,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
btrfs_set_trans_block_group(trans, dir);
- if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
err = btrfs_unlink_subvol(trans, root, dir,
BTRFS_I(inode)->location.objectid,
dentry->d_name.name,
@@ -3033,178 +3121,6 @@ out:
return err;
}
-#if 0
-/*
- * when truncating bytes in a file, it is possible to avoid reading
- * the leaves that contain only checksum items. This can be the
- * majority of the IO required to delete a large file, but it must
- * be done carefully.
- *
- * The keys in the level just above the leaves are checked to make sure
- * the lowest key in a given leaf is a csum key, and starts at an offset
- * after the new size.
- *
- * Then the key for the next leaf is checked to make sure it also has
- * a checksum item for the same file. If it does, we know our target leaf
- * contains only checksum items, and it can be safely freed without reading
- * it.
- *
- * This is just an optimization targeted at large files. It may do
- * nothing. It will return 0 unless things went badly.
- */
-static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct inode *inode, u64 new_size)
-{
- struct btrfs_key key;
- int ret;
- int nritems;
- struct btrfs_key found_key;
- struct btrfs_key other_key;
- struct btrfs_leaf_ref *ref;
- u64 leaf_gen;
- u64 leaf_start;
-
- path->lowest_level = 1;
- key.objectid = inode->i_ino;
- key.type = BTRFS_CSUM_ITEM_KEY;
- key.offset = new_size;
-again:
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto out;
-
- if (path->nodes[1] == NULL) {
- ret = 0;
- goto out;
- }
- ret = 0;
- btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
- nritems = btrfs_header_nritems(path->nodes[1]);
-
- if (!nritems)
- goto out;
-
- if (path->slots[1] >= nritems)
- goto next_node;
-
- /* did we find a key greater than anything we want to delete? */
- if (found_key.objectid > inode->i_ino ||
- (found_key.objectid == inode->i_ino && found_key.type > key.type))
- goto out;
-
- /* we check the next key in the node to make sure the leave contains
- * only checksum items. This comparison doesn't work if our
- * leaf is the last one in the node
- */
- if (path->slots[1] + 1 >= nritems) {
-next_node:
- /* search forward from the last key in the node, this
- * will bring us into the next node in the tree
- */
- btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
-
- /* unlikely, but we inc below, so check to be safe */
- if (found_key.offset == (u64)-1)
- goto out;
-
- /* search_forward needs a path with locks held, do the
- * search again for the original key. It is possible
- * this will race with a balance and return a path that
- * we could modify, but this drop is just an optimization
- * and is allowed to miss some leaves.
- */
- btrfs_release_path(root, path);
- found_key.offset++;
-
- /* setup a max key for search_forward */
- other_key.offset = (u64)-1;
- other_key.type = key.type;
- other_key.objectid = key.objectid;
-
- path->keep_locks = 1;
- ret = btrfs_search_forward(root, &found_key, &other_key,
- path, 0, 0);
- path->keep_locks = 0;
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
- ret = 0;
- goto out;
- }
-
- key.offset = found_key.offset;
- btrfs_release_path(root, path);
- cond_resched();
- goto again;
- }
-
- /* we know there's one more slot after us in the tree,
- * read that key so we can verify it is also a checksum item
- */
- btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
-
- if (found_key.objectid < inode->i_ino)
- goto next_key;
-
- if (found_key.type != key.type || found_key.offset < new_size)
- goto next_key;
-
- /*
- * if the key for the next leaf isn't a csum key from this objectid,
- * we can't be sure there aren't good items inside this leaf.
- * Bail out
- */
- if (other_key.objectid != inode->i_ino || other_key.type != key.type)
- goto out;
-
- leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
- leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
- /*
- * it is safe to delete this leaf, it contains only
- * csum items from this inode at an offset >= new_size
- */
- ret = btrfs_del_leaf(trans, root, path, leaf_start);
- BUG_ON(ret);
-
- if (root->ref_cows && leaf_gen < trans->transid) {
- ref = btrfs_alloc_leaf_ref(root, 0);
- if (ref) {
- ref->root_gen = root->root_key.offset;
- ref->bytenr = leaf_start;
- ref->owner = 0;
- ref->generation = leaf_gen;
- ref->nritems = 0;
-
- btrfs_sort_leaf_ref(ref);
-
- ret = btrfs_add_leaf_ref(root, ref, 0);
- WARN_ON(ret);
- btrfs_free_leaf_ref(root, ref);
- } else {
- WARN_ON(1);
- }
- }
-next_key:
- btrfs_release_path(root, path);
-
- if (other_key.objectid == inode->i_ino &&
- other_key.type == key.type && other_key.offset > key.offset) {
- key.offset = other_key.offset;
- cond_resched();
- goto again;
- }
- ret = 0;
-out:
- /* fixup any changes we've made to the path */
- path->lowest_level = 0;
- path->keep_locks = 0;
- btrfs_release_path(root, path);
- return ret;
-}
-
-#endif
-
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -3240,17 +3156,27 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
int encoding;
int ret;
int err = 0;
+ u64 ino = btrfs_ino(inode);
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
if (root->ref_cows || root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+ /*
+ * This function is also used to drop the items in the log tree before
+ * we relog the inode, so if root != BTRFS_I(inode)->root, it means
+ * it is used to drop the logged items. So we shouldn't kill the delayed
+ * items.
+ */
+ if (min_type == 0 && root == BTRFS_I(inode)->root)
+ btrfs_kill_delayed_inode_items(inode);
+
path = btrfs_alloc_path();
BUG_ON(!path);
path->reada = -1;
- key.objectid = inode->i_ino;
+ key.objectid = ino;
key.offset = (u64)-1;
key.type = (u8)-1;
@@ -3278,7 +3204,7 @@ search_again:
found_type = btrfs_key_type(&found_key);
encoding = 0;
- if (found_key.objectid != inode->i_ino)
+ if (found_key.objectid != ino)
break;
if (found_type < min_type)
@@ -3368,7 +3294,6 @@ search_again:
btrfs_file_extent_calc_inline_size(size);
ret = btrfs_truncate_item(trans, root, path,
size, 1);
- BUG_ON(ret);
} else if (root->ref_cows) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
@@ -3397,7 +3322,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- inode->i_ino, extent_offset);
+ ino, extent_offset);
BUG_ON(ret);
}
@@ -3406,7 +3331,9 @@ delete:
if (path->slots[0] == 0 ||
path->slots[0] != pending_del_slot) {
- if (root->ref_cows) {
+ if (root->ref_cows &&
+ BTRFS_I(inode)->location.objectid !=
+ BTRFS_FREE_INO_OBJECTID) {
err = -EAGAIN;
goto out;
}
@@ -3417,7 +3344,7 @@ delete:
BUG_ON(ret);
pending_del_nr = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto search_again;
} else {
path->slots[0]--;
@@ -3533,7 +3460,13 @@ out:
return ret;
}
-int btrfs_cont_expand(struct inode *inode, loff_t size)
+/*
+ * This function puts in dummy file extents for the area we're creating a hole
+ * for. So if we are truncating this file to a larger size we need to insert
+ * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
+ * for the range between oldsize and size
+ */
+int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3541,7 +3474,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 mask = root->sectorsize - 1;
- u64 hole_start = (inode->i_size + mask) & ~mask;
+ u64 hole_start = (oldsize + mask) & ~mask;
u64 block_end = (size + mask) & ~mask;
u64 last_byte;
u64 cur_offset;
@@ -3569,7 +3502,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
block_end - cur_offset, 0);
- BUG_ON(IS_ERR(em) || !em);
+ BUG_ON(IS_ERR_OR_NULL(em));
last_byte = min(extent_map_end(em), block_end);
last_byte = (last_byte + mask) & ~mask;
if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3586,13 +3519,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
err = btrfs_drop_extents(trans, inode, cur_offset,
cur_offset + hole_size,
&hint_byte, 1);
- BUG_ON(err);
+ if (err)
+ break;
err = btrfs_insert_file_extent(trans, root,
- inode->i_ino, cur_offset, 0,
+ btrfs_ino(inode), cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
- BUG_ON(err);
+ if (err)
+ break;
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
@@ -3612,81 +3547,41 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
return err;
}
-static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+static int btrfs_setsize(struct inode *inode, loff_t newsize)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
- unsigned long nr;
+ loff_t oldsize = i_size_read(inode);
int ret;
- if (attr->ia_size == inode->i_size)
+ if (newsize == oldsize)
return 0;
- if (attr->ia_size > inode->i_size) {
- unsigned long limit;
- limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
- if (attr->ia_size > inode->i_sb->s_maxbytes)
- return -EFBIG;
- if (limit != RLIM_INFINITY && attr->ia_size > limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- }
-
- trans = btrfs_start_transaction(root, 5);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- btrfs_set_trans_block_group(trans, inode);
-
- ret = btrfs_orphan_add(trans, inode);
- BUG_ON(ret);
-
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
-
- if (attr->ia_size > inode->i_size) {
- ret = btrfs_cont_expand(inode, attr->ia_size);
+ if (newsize > oldsize) {
+ i_size_write(inode, newsize);
+ btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
+ truncate_pagecache(inode, oldsize, newsize);
+ ret = btrfs_cont_expand(inode, oldsize, newsize);
if (ret) {
- btrfs_truncate(inode);
+ btrfs_setsize(inode, oldsize);
return ret;
}
- i_size_write(inode, attr->ia_size);
- btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ } else {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- btrfs_set_trans_block_group(trans, inode);
- trans->block_rsv = root->orphan_block_rsv;
- BUG_ON(!trans->block_rsv);
+ /*
+ * We're truncating a file that used to have good data down to
+ * zero. Make sure it gets into the ordered flush list so that
+ * any new writes get down to disk quickly.
+ */
+ if (newsize == 0)
+ BTRFS_I(inode)->ordered_data_close = 1;
- ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
- if (inode->i_nlink > 0) {
- ret = btrfs_orphan_del(trans, inode);
- BUG_ON(ret);
- }
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
- btrfs_btree_balance_dirty(root, nr);
- return 0;
+ /* we don't support swapfiles, so vmtruncate shouldn't fail */
+ truncate_setsize(inode, newsize);
+ ret = btrfs_truncate(inode);
}
- /*
- * We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
- */
- if (attr->ia_size == 0)
- BTRFS_I(inode)->ordered_data_close = 1;
-
- /* we don't support swapfiles, so vmtruncate shouldn't fail */
- ret = vmtruncate(inode, attr->ia_size);
- BUG_ON(ret);
-
- return 0;
+ return ret;
}
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3703,7 +3598,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
return err;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setattr_size(inode, attr);
+ err = btrfs_setsize(inode, attr->ia_size);
if (err)
return err;
}
@@ -3726,9 +3621,11 @@ void btrfs_evict_inode(struct inode *inode)
unsigned long nr;
int ret;
+ trace_btrfs_inode_evict(inode);
+
truncate_inode_pages(&inode->i_data, 0);
if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
- root == root->fs_info->tree_root))
+ is_free_space_inode(root, inode)))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -3781,6 +3678,10 @@ void btrfs_evict_inode(struct inode *inode)
BUG_ON(ret);
}
+ if (!(root == root->fs_info->tree_root ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
+ btrfs_return_ino(root, btrfs_ino(inode));
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root, nr);
@@ -3806,12 +3707,12 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
path = btrfs_alloc_path();
BUG_ON(!path);
- di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
+ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
namelen, 0);
if (IS_ERR(di))
ret = PTR_ERR(di);
- if (!di || IS_ERR(di))
+ if (IS_ERR_OR_NULL(di))
goto out_err;
btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
@@ -3859,7 +3760,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
- if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
goto out;
@@ -3869,7 +3770,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
if (ret)
goto out;
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
if (IS_ERR(new_root)) {
@@ -3898,6 +3799,7 @@ static void inode_tree_add(struct inode *inode)
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
+ u64 ino = btrfs_ino(inode);
again:
p = &root->inode_tree.rb_node;
parent = NULL;
@@ -3910,9 +3812,9 @@ again:
parent = *p;
entry = rb_entry(parent, struct btrfs_inode, rb_node);
- if (inode->i_ino < entry->vfs_inode.i_ino)
+ if (ino < btrfs_ino(&entry->vfs_inode))
p = &parent->rb_left;
- else if (inode->i_ino > entry->vfs_inode.i_ino)
+ else if (ino > btrfs_ino(&entry->vfs_inode))
p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
@@ -3976,9 +3878,9 @@ again:
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
- if (objectid < entry->vfs_inode.i_ino)
+ if (objectid < btrfs_ino(&entry->vfs_inode))
node = node->rb_left;
- else if (objectid > entry->vfs_inode.i_ino)
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
node = node->rb_right;
else
break;
@@ -3986,7 +3888,7 @@ again:
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= entry->vfs_inode.i_ino) {
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
node = prev;
break;
}
@@ -3995,7 +3897,7 @@ again:
}
while (node) {
entry = rb_entry(node, struct btrfs_inode, rb_node);
- objectid = entry->vfs_inode.i_ino + 1;
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
inode = igrab(&entry->vfs_inode);
if (inode) {
spin_unlock(&root->inode_lock);
@@ -4033,7 +3935,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == inode->i_ino &&
+ return args->ino == btrfs_ino(inode) &&
args->root == BTRFS_I(inode)->root;
}
@@ -4068,7 +3970,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
btrfs_read_locked_inode(inode);
-
inode_tree_add(inode);
unlock_new_inode(inode);
if (new)
@@ -4143,8 +4044,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (!IS_ERR(inode) && root != sub_root) {
down_read(&root->fs_info->cleanup_work_sem);
if (!(inode->i_sb->s_flags & MS_RDONLY))
- btrfs_orphan_cleanup(sub_root);
+ ret = btrfs_orphan_cleanup(sub_root);
up_read(&root->fs_info->cleanup_work_sem);
+ if (ret)
+ inode = ERR_PTR(ret);
}
return inode;
@@ -4177,7 +4080,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
return d_splice_alias(inode, dentry);
}
-static unsigned char btrfs_filetype_table[] = {
+unsigned char btrfs_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
@@ -4191,11 +4094,11 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
+ struct list_head ins_list;
+ struct list_head del_list;
int ret;
- u32 nritems;
struct extent_buffer *leaf;
int slot;
- int advance;
unsigned char d_type;
int over = 0;
u32 di_cur;
@@ -4205,6 +4108,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
char tmp_name[32];
char *name_ptr;
int name_len;
+ int is_curr = 0; /* filp->f_pos points to the current index? */
/* FIXME, use a real flag for deciding about the key type */
if (root->fs_info->tree_root == root)
@@ -4212,9 +4116,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
/* special case for "." */
if (filp->f_pos == 0) {
- over = filldir(dirent, ".", 1,
- 1, inode->i_ino,
- DT_DIR);
+ over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR);
if (over)
return 0;
filp->f_pos = 1;
@@ -4229,36 +4131,36 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
filp->f_pos = 2;
}
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
path->reada = 2;
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ INIT_LIST_HEAD(&ins_list);
+ INIT_LIST_HEAD(&del_list);
+ btrfs_get_delayed_items(inode, &ins_list, &del_list);
+ }
+
btrfs_set_key_type(&key, key_type);
key.offset = filp->f_pos;
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
- advance = 0;
while (1) {
leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
slot = path->slots[0];
- if (advance || slot >= nritems) {
- if (slot >= nritems - 1) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- } else {
- slot++;
- path->slots[0]++;
- }
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
}
- advance = 1;
item = btrfs_item_nr(leaf, slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -4267,9 +4169,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
if (btrfs_key_type(&found_key) != key_type)
break;
if (found_key.offset < filp->f_pos)
- continue;
+ goto next;
+ if (key_type == BTRFS_DIR_INDEX_KEY &&
+ btrfs_should_delete_dir_index(&del_list,
+ found_key.offset))
+ goto next;
filp->f_pos = found_key.offset;
+ is_curr = 1;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
di_cur = 0;
@@ -4278,6 +4185,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
while (di_cur < di_total) {
struct btrfs_key location;
+ if (verify_dir_item(root, leaf, di))
+ break;
+
name_len = btrfs_dir_name_len(leaf, di);
if (name_len <= sizeof(tmp_name)) {
name_ptr = tmp_name;
@@ -4317,6 +4227,17 @@ skip:
di_cur += di_len;
di = (struct btrfs_dir_item *)((char *)di + di_len);
}
+next:
+ path->slots[0]++;
+ }
+
+ if (key_type == BTRFS_DIR_INDEX_KEY) {
+ if (is_curr)
+ filp->f_pos++;
+ ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
+ &ins_list);
+ if (ret)
+ goto nopos;
}
/* Reached end of directory/root. Bump pos past the last item. */
@@ -4331,6 +4252,8 @@ skip:
nopos:
ret = 0;
err:
+ if (key_type == BTRFS_DIR_INDEX_KEY)
+ btrfs_put_delayed_items(&ins_list, &del_list);
btrfs_free_path(path);
return ret;
}
@@ -4346,7 +4269,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
smp_mb();
- nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
+ if (root->fs_info->closing && is_free_space_inode(root, inode))
+ nolock = true;
if (wbc->sync_mode == WB_SYNC_ALL) {
if (nolock)
@@ -4370,7 +4294,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
* FIXME, needs more benchmarking...there are no reasons other than performance
* to keep or drop this code.
*/
-void btrfs_dirty_inode(struct inode *inode)
+void btrfs_dirty_inode(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
@@ -4389,25 +4313,25 @@ void btrfs_dirty_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
- if (printk_ratelimit()) {
- printk(KERN_ERR "btrfs: fail to "
- "dirty inode %lu error %ld\n",
- inode->i_ino, PTR_ERR(trans));
- }
+ printk_ratelimited(KERN_ERR "btrfs: fail to "
+ "dirty inode %llu error %ld\n",
+ (unsigned long long)btrfs_ino(inode),
+ PTR_ERR(trans));
return;
}
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- if (printk_ratelimit()) {
- printk(KERN_ERR "btrfs: fail to "
- "dirty inode %lu error %d\n",
- inode->i_ino, ret);
- }
+ printk_ratelimited(KERN_ERR "btrfs: fail to "
+ "dirty inode %llu error %d\n",
+ (unsigned long long)btrfs_ino(inode),
+ ret);
}
}
btrfs_end_transaction(trans, root);
+ if (BTRFS_I(inode)->delayed_node)
+ btrfs_balance_delayed_items(root);
}
/*
@@ -4423,7 +4347,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
struct extent_buffer *leaf;
int ret;
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
key.offset = (u64)-1;
@@ -4455,7 +4379,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != inode->i_ino ||
+ if (found_key.objectid != btrfs_ino(inode) ||
btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
BTRFS_I(inode)->index_cnt = 2;
goto out;
@@ -4476,9 +4400,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
int ret = 0;
if (BTRFS_I(dir)->index_cnt == (u64)-1) {
- ret = btrfs_set_inode_index_count(dir);
- if (ret)
- return ret;
+ ret = btrfs_inode_delayed_dir_index_count(dir);
+ if (ret) {
+ ret = btrfs_set_inode_index_count(dir);
+ if (ret)
+ return ret;
+ }
}
*index = BTRFS_I(dir)->index_cnt;
@@ -4509,12 +4436,23 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BUG_ON(!path);
inode = new_inode(root->fs_info->sb);
- if (!inode)
+ if (!inode) {
+ btrfs_free_path(path);
return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * we have to initialize this early, so we can reclaim the inode
+ * number if we fail afterwards in this function.
+ */
+ inode->i_ino = objectid;
if (dir) {
+ trace_btrfs_inode_request(dir);
+
ret = btrfs_set_inode_index(dir, index);
if (ret) {
+ btrfs_free_path(path);
iput(inode);
return ERR_PTR(ret);
}
@@ -4554,7 +4492,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
goto fail;
inode_init_owner(inode, dir, mode);
- inode->i_ino = objectid;
inode_set_bytes(inode, 0);
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -4581,12 +4518,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if ((mode & S_IFREG)) {
if (btrfs_test_opt(root, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(root, NODATACOW) ||
+ (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
}
insert_inode_hash(inode);
inode_tree_add(inode);
+
+ trace_btrfs_inode_new(inode);
+
return inode;
fail:
if (dir)
@@ -4614,29 +4555,29 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
+ u64 ino = btrfs_ino(inode);
+ u64 parent_ino = btrfs_ino(parent_inode);
- if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
- key.objectid = inode->i_ino;
+ key.objectid = ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
}
- if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
key.objectid, root->root_key.objectid,
- parent_inode->i_ino,
- index, name, name_len);
+ parent_ino, index, name, name_len);
} else if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root,
- name, name_len, inode->i_ino,
- parent_inode->i_ino, index);
+ ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
+ parent_ino, index);
}
if (ret == 0) {
ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode->i_ino, &key,
+ parent_inode, &key,
btrfs_inode_type(inode), index);
BUG_ON(ret);
@@ -4679,10 +4620,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -4694,14 +4631,19 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
BTRFS_I(dir)->block_group, mode, &index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
- err = btrfs_init_inode_security(trans, inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -4741,9 +4683,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
/*
* 2 for inode item and ref
* 2 for dir items
@@ -4755,14 +4694,19 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
BTRFS_I(dir)->block_group, mode, &index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
- err = btrfs_init_inode_security(trans, inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -4803,30 +4747,31 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int err;
int drop_inode = 0;
- if (inode->i_nlink == 0)
- return -ENOENT;
-
/* do not allow sys_link's with other subvols of the same device */
if (root->objectid != BTRFS_I(inode)->root->objectid)
- return -EPERM;
+ return -EXDEV;
- btrfs_inc_nlink(inode);
- inode->i_ctime = CURRENT_TIME;
+ if (inode->i_nlink == ~0U)
+ return -EMLINK;
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
/*
- * 1 item for inode ref
+ * 2 items for inode and inode ref
* 2 items for dir items
+ * 1 item for parent inode
*/
- trans = btrfs_start_transaction(root, 3);
+ trans = btrfs_start_transaction(root, 5);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto fail;
}
+ btrfs_inc_nlink(inode);
+ inode->i_ctime = CURRENT_TIME;
+
btrfs_set_trans_block_group(trans, dir);
ihold(inode);
@@ -4865,10 +4810,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
-
/*
* 2 items for inode and ref
* 2 items for dir items
@@ -4879,8 +4820,12 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_fail;
+
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
BTRFS_I(dir)->block_group, S_IFDIR | mode,
&index);
if (IS_ERR(inode)) {
@@ -4890,7 +4835,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
drop_on_err = 1;
- err = btrfs_init_inode_security(trans, inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
goto out_fail;
@@ -4964,6 +4909,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
inline_size = btrfs_file_extent_inline_item_len(leaf,
btrfs_item_nr(leaf, path->slots[0]));
tmp = kmalloc(inline_size, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
ptr = btrfs_file_extent_inline_start(item);
read_extent_buffer(leaf, tmp, ptr, inline_size);
@@ -5001,7 +4948,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
u64 bytenr;
u64 extent_start = 0;
u64 extent_end = 0;
- u64 objectid = inode->i_ino;
+ u64 objectid = btrfs_ino(inode);
u32 found_type;
struct btrfs_path *path = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5029,7 +4976,7 @@ again:
else
goto out;
}
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
err = -ENOMEM;
goto out;
@@ -5183,7 +5130,7 @@ again:
kunmap(page);
free_extent_map(em);
em = NULL;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
trans = btrfs_join_transaction(root, 1);
if (IS_ERR(trans))
return ERR_CAST(trans);
@@ -5196,7 +5143,7 @@ again:
btrfs_mark_buffer_dirty(leaf);
}
set_extent_uptodate(io_tree, em->start,
- extent_map_end(em) - 1, GFP_NOFS);
+ extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
} else {
printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5209,7 +5156,7 @@ not_found_em:
em->block_start = EXTENT_MAP_HOLE;
set_bit(EXTENT_FLAG_VACANCY, &em->flags);
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
"[%llu %llu]\n", (unsigned long long)em->start,
@@ -5263,6 +5210,9 @@ insert:
}
write_unlock(&em_tree->lock);
out:
+
+ trace_btrfs_get_extent(root, em);
+
if (path)
btrfs_free_path(path);
if (trans) {
@@ -5277,23 +5227,161 @@ out:
return em;
}
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
+{
+ struct extent_map *em;
+ struct extent_map *hole_em = NULL;
+ u64 range_start = start;
+ u64 end;
+ u64 found;
+ u64 found_end;
+ int err = 0;
+
+ em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+ if (IS_ERR(em))
+ return em;
+ if (em) {
+ /*
+ * if our em maps to a hole, there might
+ * actually be delalloc bytes behind it
+ */
+ if (em->block_start != EXTENT_MAP_HOLE)
+ return em;
+ else
+ hole_em = em;
+ }
+
+ /* check to see if we've wrapped (len == -1 or similar) */
+ end = start + len;
+ if (end < start)
+ end = (u64)-1;
+ else
+ end -= 1;
+
+ em = NULL;
+
+ /* ok, we didn't find anything, let's look for delalloc */
+ found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+ end, len, EXTENT_DELALLOC, 1);
+ found_end = range_start + found;
+ if (found_end < range_start)
+ found_end = (u64)-1;
+
+ /*
+ * we didn't find anything useful, return
+ * the original results from get_extent()
+ */
+ if (range_start > end || found_end <= start) {
+ em = hole_em;
+ hole_em = NULL;
+ goto out;
+ }
+
+ /* adjust the range_start to make sure it doesn't
+ * go backwards from the start they passed in
+ */
+ range_start = max(start,range_start);
+ found = found_end - range_start;
+
+ if (found > 0) {
+ u64 hole_start = start;
+ u64 hole_len = len;
+
+ em = alloc_extent_map();
+ if (!em) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /*
+ * when btrfs_get_extent can't find anything it
+ * returns one huge hole
+ *
+ * make sure what it found really fits our range, and
+ * adjust to make sure it is based on the start from
+ * the caller
+ */
+ if (hole_em) {
+ u64 calc_end = extent_map_end(hole_em);
+
+ if (calc_end <= start || (hole_em->start > end)) {
+ free_extent_map(hole_em);
+ hole_em = NULL;
+ } else {
+ hole_start = max(hole_em->start, start);
+ hole_len = calc_end - hole_start;
+ }
+ }
+ em->bdev = NULL;
+ if (hole_em && range_start > hole_start) {
+ /* our hole starts before our delalloc, so we
+ * have to return just the parts of the hole
+ * that go until the delalloc starts
+ */
+ em->len = min(hole_len,
+ range_start - hole_start);
+ em->start = hole_start;
+ em->orig_start = hole_start;
+ /*
+ * don't adjust block start at all,
+ * it is fixed at EXTENT_MAP_HOLE
+ */
+ em->block_start = hole_em->block_start;
+ em->block_len = hole_len;
+ } else {
+ em->start = range_start;
+ em->len = found;
+ em->orig_start = range_start;
+ em->block_start = EXTENT_MAP_DELALLOC;
+ em->block_len = found;
+ }
+ } else if (hole_em) {
+ return hole_em;
+ }
+out:
+
+ free_extent_map(hole_em);
+ if (err) {
+ free_extent_map(em);
+ return ERR_PTR(err);
+ }
+ return em;
+}
+
static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ struct extent_map *em,
u64 start, u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct extent_map *em;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct btrfs_key ins;
u64 alloc_hint;
int ret;
+ bool insert = false;
- btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ /*
+ * Ok if the extent map we looked up is a hole and is for the exact
+ * range we want, there is no reason to allocate a new one, however if
+ * it is not right then we need to free this one and drop the cache for
+ * our range.
+ */
+ if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
+ em->len != len) {
+ free_extent_map(em);
+ em = NULL;
+ insert = true;
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+ }
trans = btrfs_join_transaction(root, 0);
if (IS_ERR(trans))
return ERR_CAST(trans);
+ if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
+ btrfs_add_inode_defrag(trans, inode);
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
alloc_hint = get_extent_allocation_hint(inode, start, len);
@@ -5304,10 +5392,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
goto out;
}
- em = alloc_extent_map(GFP_NOFS);
if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
+ em = alloc_extent_map();
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
}
em->start = start;
@@ -5317,9 +5407,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
em->block_start = ins.objectid;
em->block_len = ins.offset;
em->bdev = root->fs_info->fs_devices->latest_bdev;
+
+ /*
+ * We need to do this because if we're using the original em we searched
+ * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
+ */
+ em->flags = 0;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
- while (1) {
+ while (insert) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
@@ -5363,7 +5459,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
offset, 0);
if (ret < 0)
goto out;
@@ -5380,7 +5476,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid != inode->i_ino ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
/* not our file or wrong item type, must cow */
goto out;
@@ -5414,7 +5510,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
* look for other files referencing this extent, if we
* find any we must cow
*/
- if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
key.offset - backref_offset, disk_bytenr))
goto out;
@@ -5537,8 +5633,7 @@ must_cow:
* it above
*/
len = bh_result->b_size;
- free_extent_map(em);
- em = btrfs_new_extent_direct(inode, start, len);
+ em = btrfs_new_extent_direct(inode, em, start, len);
if (IS_ERR(em))
return PTR_ERR(em);
len = min(len, em->len - (start - em->start));
@@ -5605,9 +5700,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
flush_dcache_page(bvec->bv_page);
if (csum != *private) {
- printk(KERN_ERR "btrfs csum failed ino %lu off"
+ printk(KERN_ERR "btrfs csum failed ino %llu off"
" %llu csum %u private %u\n",
- inode->i_ino, (unsigned long long)start,
+ (unsigned long long)btrfs_ino(inode),
+ (unsigned long long)start,
csum, *private);
err = -EIO;
}
@@ -5624,6 +5720,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
kfree(dip->csums);
kfree(dip);
+
+ /* If we had a csum failure make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
dio_end_io(bio, err);
}
@@ -5697,8 +5797,10 @@ again:
}
add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
- btrfs_ordered_update_i_size(inode, 0, ordered);
- btrfs_update_inode(trans, root, inode);
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+ if (!ret)
+ btrfs_update_inode(trans, root, inode);
+ ret = 0;
out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
ordered->file_offset + ordered->len - 1,
@@ -5725,6 +5827,10 @@ out_done:
kfree(dip->csums);
kfree(dip);
+
+ /* If we had an error make sure to clear the uptodate flag */
+ if (err)
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
dio_end_io(bio, err);
}
@@ -5744,9 +5850,9 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
if (err) {
- printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
+ printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
"sector %#Lx len %u err no %d\n",
- dip->inode->i_ino, bio->bi_rw,
+ (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
(unsigned long long)bio->bi_sector, bio->bi_size, err);
dip->errors = 1;
@@ -5780,7 +5886,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
int rw, u64 file_offset, int skip_sum,
- u32 *csums)
+ u32 *csums, int async_submit)
{
int write = rw & REQ_WRITE;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5791,18 +5897,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
if (ret)
goto err;
- if (write && !skip_sum) {
+ if (skip_sum)
+ goto map;
+
+ if (write && async_submit) {
ret = btrfs_wq_submit_bio(root->fs_info,
inode, rw, bio, 0, 0,
file_offset,
__btrfs_submit_bio_start_direct_io,
__btrfs_submit_bio_done);
goto err;
- } else if (!skip_sum)
- btrfs_lookup_bio_sums_dio(root, inode, bio,
+ } else if (write) {
+ /*
+ * If we aren't doing async submit, calculate the csum of the
+ * bio now.
+ */
+ ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
+ if (ret)
+ goto err;
+ } else if (!skip_sum) {
+ ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
file_offset, csums);
+ if (ret)
+ goto err;
+ }
- ret = btrfs_map_bio(root, rw, bio, 0, 1);
+map:
+ ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
err:
bio_put(bio);
return ret;
@@ -5824,22 +5945,30 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int nr_pages = 0;
u32 *csums = dip->csums;
int ret = 0;
-
- bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
- if (!bio)
- return -ENOMEM;
- bio->bi_private = dip;
- bio->bi_end_io = btrfs_end_dio_bio;
- atomic_inc(&dip->pending_bios);
+ int async_submit = 0;
+ int write = rw & REQ_WRITE;
map_length = orig_bio->bi_size;
ret = btrfs_map_block(map_tree, READ, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
- bio_put(bio);
+ bio_put(orig_bio);
return -EIO;
}
+ if (map_length >= orig_bio->bi_size) {
+ bio = orig_bio;
+ goto submit;
+ }
+
+ async_submit = 1;
+ bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
+ if (!bio)
+ return -ENOMEM;
+ bio->bi_private = dip;
+ bio->bi_end_io = btrfs_end_dio_bio;
+ atomic_inc(&dip->pending_bios);
+
while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
if (unlikely(map_length < submit_len + bvec->bv_len ||
bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5853,14 +5982,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
atomic_inc(&dip->pending_bios);
ret = __btrfs_submit_dio_bio(bio, inode, rw,
file_offset, skip_sum,
- csums);
+ csums, async_submit);
if (ret) {
bio_put(bio);
atomic_dec(&dip->pending_bios);
goto out_err;
}
- if (!skip_sum)
+ /* Write's use the ordered csums */
+ if (!write && !skip_sum)
csums = csums + nr_pages;
start_sector += submit_len >> 9;
file_offset += submit_len;
@@ -5889,8 +6019,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
}
+submit:
ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
- csums);
+ csums, async_submit);
if (!ret)
return 0;
@@ -5928,9 +6059,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
}
dip->csums = NULL;
- if (!skip_sum) {
+ /* Write's use the ordered csum stuff, so we don't need dip->csums */
+ if (!write && !skip_sum) {
dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
if (!dip->csums) {
+ kfree(dip);
ret = -ENOMEM;
goto free_ordered;
}
@@ -5983,6 +6116,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
unsigned long nr_segs)
{
int seg;
+ int i;
size_t size;
unsigned long addr;
unsigned blocksize_mask = root->sectorsize - 1;
@@ -5997,8 +6131,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
addr = (unsigned long)iov[seg].iov_base;
size = iov[seg].iov_len;
end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
goto out;
+
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ continue;
+
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (i = seg + 1; i < nr_segs; i++) {
+ if (iov[seg].iov_base == iov[i].iov_base)
+ goto out;
+ }
}
retval = 0;
out:
@@ -6099,7 +6247,7 @@ out:
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
- return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+ return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
}
int btrfs_readpage(struct file *file, struct page *page)
@@ -6349,28 +6497,42 @@ out:
return ret;
}
-static void btrfs_truncate(struct inode *inode)
+static int btrfs_truncate(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ int err = 0;
struct btrfs_trans_handle *trans;
unsigned long nr;
u64 mask = root->sectorsize - 1;
- if (!S_ISREG(inode->i_mode)) {
- WARN_ON(1);
- return;
- }
-
ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
if (ret)
- return;
+ return ret;
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ nr = trans->blocks_used;
+ btrfs_end_transaction(trans, root);
+ btrfs_btree_balance_dirty(root, nr);
+
+ /* Now start a transaction for the truncate */
trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = root->orphan_block_rsv;
@@ -6397,29 +6559,38 @@ static void btrfs_truncate(struct inode *inode)
while (1) {
if (!trans) {
trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, inode);
trans->block_rsv = root->orphan_block_rsv;
}
ret = btrfs_block_rsv_check(trans, root,
root->orphan_block_rsv, 0, 5);
- if (ret) {
- BUG_ON(ret != -EAGAIN);
+ if (ret == -EAGAIN) {
ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
+ if (ret)
+ return ret;
trans = NULL;
continue;
+ } else if (ret) {
+ err = ret;
+ break;
}
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
- if (ret != -EAGAIN)
+ if (ret != -EAGAIN) {
+ err = ret;
break;
+ }
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ break;
+ }
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
@@ -6429,16 +6600,27 @@ static void btrfs_truncate(struct inode *inode)
if (ret == 0 && inode->i_nlink > 0) {
ret = btrfs_orphan_del(trans, inode);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
+ } else if (ret && inode->i_nlink > 0) {
+ /*
+ * Failed to do the truncate, remove us from the in memory
+ * orphan list.
+ */
+ ret = btrfs_orphan_del(NULL, inode);
}
ret = btrfs_update_inode(trans, root, inode);
- BUG_ON(ret);
+ if (ret && !err)
+ err = ret;
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
- BUG_ON(ret);
+ if (ret && !err)
+ err = ret;
btrfs_btree_balance_dirty(root, nr);
+
+ return err;
}
/*
@@ -6505,19 +6687,21 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->index_cnt = (u64)-1;
ei->last_unlink_trans = 0;
- spin_lock_init(&ei->accounting_lock);
atomic_set(&ei->outstanding_extents, 0);
- ei->reserved_extents = 0;
+ atomic_set(&ei->reserved_extents, 0);
ei->ordered_data_close = 0;
ei->orphan_meta_reserved = 0;
ei->dummy_inode = 0;
+ ei->in_defrag = 0;
ei->force_compress = BTRFS_COMPRESS_NONE;
+ ei->delayed_node = NULL;
+
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
- extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
- extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ extent_map_tree_init(&ei->extent_tree);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
@@ -6543,7 +6727,7 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
- WARN_ON(BTRFS_I(inode)->reserved_extents);
+ WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
/*
* This can happen where we create an inode, but somebody else also
@@ -6581,8 +6765,8 @@ void btrfs_destroy_inode(struct inode *inode)
spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
- inode->i_ino);
+ printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
+ (unsigned long long)btrfs_ino(inode));
list_del_init(&BTRFS_I(inode)->i_orphan);
}
spin_unlock(&root->orphan_lock);
@@ -6604,6 +6788,7 @@ void btrfs_destroy_inode(struct inode *inode)
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
+ btrfs_remove_delayed_node(inode);
call_rcu(&inode->i_rcu, btrfs_i_callback);
}
@@ -6612,7 +6797,7 @@ int btrfs_drop_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root)
+ !is_free_space_inode(root, inode))
return 1;
else
return generic_drop_inode(inode);
@@ -6635,6 +6820,8 @@ void btrfs_destroy_cachep(void)
kmem_cache_destroy(btrfs_transaction_cachep);
if (btrfs_path_cachep)
kmem_cache_destroy(btrfs_path_cachep);
+ if (btrfs_free_space_cachep)
+ kmem_cache_destroy(btrfs_free_space_cachep);
}
int btrfs_init_cachep(void)
@@ -6663,6 +6850,12 @@ int btrfs_init_cachep(void)
if (!btrfs_path_cachep)
goto fail;
+ btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+ sizeof(struct btrfs_free_space), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ if (!btrfs_free_space_cachep)
+ goto fail;
+
return 0;
fail:
btrfs_destroy_cachep();
@@ -6681,6 +6874,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
return 0;
}
+/*
+ * A renamed inode takes over the nodatacow and compress attributes of the
+ * directory it lands in: each bit in @inode is set or cleared so that it
+ * mirrors the same bit in @dir.
+ */
+static void fixup_inode_flags(struct inode *dir, struct inode *inode)
+{
+	struct btrfs_inode *parent = BTRFS_I(dir);
+	struct btrfs_inode *moved = BTRFS_I(inode);
+
+	if (!(parent->flags & BTRFS_INODE_NODATACOW))
+		moved->flags &= ~BTRFS_INODE_NODATACOW;
+	else
+		moved->flags |= BTRFS_INODE_NODATACOW;
+
+	if (!(parent->flags & BTRFS_INODE_COMPRESS))
+		moved->flags &= ~BTRFS_INODE_COMPRESS;
+	else
+		moved->flags |= BTRFS_INODE_COMPRESS;
+}
+
static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -6693,16 +6906,17 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
u64 index = 0;
u64 root_objectid;
int ret;
+ u64 old_ino = btrfs_ino(old_inode);
- if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
/* we only allow rename subvolume link between subvolumes */
- if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
- (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
return -ENOTEMPTY;
if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -6718,7 +6932,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
/*
* We want to reserve the absolute worst case amount of items. So if
@@ -6729,8 +6943,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* should cover the worst case number of items we'll modify.
*/
trans = btrfs_start_transaction(root, 20);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out_notrans;
+ }
btrfs_set_trans_block_group(trans, new_dir);
@@ -6741,15 +6957,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
- if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
root->fs_info->last_trans_log_full_commit = trans->transid;
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
- old_inode->i_ino,
- new_dir->i_ino, index);
+ old_ino,
+ btrfs_ino(new_dir), index);
if (ret)
goto out_fail;
/*
@@ -6765,10 +6981,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* make sure the inode gets flushed if it is replacing
* something.
*/
- if (new_inode && new_inode->i_size &&
- old_inode && S_ISREG(old_inode->i_mode)) {
+ if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
btrfs_add_ordered_operation(trans, root, old_inode);
- }
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
@@ -6777,23 +6991,24 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
- if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
old_dentry->d_name.name,
old_dentry->d_name.len);
} else {
- btrfs_inc_nlink(old_dentry->d_inode);
- ret = btrfs_unlink_inode(trans, root, old_dir,
- old_dentry->d_inode,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = __btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, old_inode);
}
BUG_ON(ret);
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
- if (unlikely(new_inode->i_ino ==
+ if (unlikely(btrfs_ino(new_inode) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
ret = btrfs_unlink_subvol(trans, dest, new_dir,
@@ -6814,12 +7029,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
+ fixup_inode_flags(new_dir, old_inode);
+
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
BUG_ON(ret);
- if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
struct dentry *parent = dget_parent(new_dentry);
btrfs_log_new_name(trans, old_inode, old_dir, parent);
dput(parent);
@@ -6827,8 +7044,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
out_fail:
btrfs_end_transaction_throttle(trans, root);
-
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+out_notrans:
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
return ret;
@@ -6882,58 +7099,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return 0;
}
-int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
- int sync)
-{
- struct btrfs_inode *binode;
- struct inode *inode = NULL;
-
- spin_lock(&root->fs_info->delalloc_lock);
- while (!list_empty(&root->fs_info->delalloc_inodes)) {
- binode = list_entry(root->fs_info->delalloc_inodes.next,
- struct btrfs_inode, delalloc_inodes);
- inode = igrab(&binode->vfs_inode);
- if (inode) {
- list_move_tail(&binode->delalloc_inodes,
- &root->fs_info->delalloc_inodes);
- break;
- }
-
- list_del_init(&binode->delalloc_inodes);
- cond_resched_lock(&root->fs_info->delalloc_lock);
- }
- spin_unlock(&root->fs_info->delalloc_lock);
-
- if (inode) {
- if (sync) {
- filemap_write_and_wait(inode->i_mapping);
- /*
- * We have to do this because compression doesn't
- * actually set PG_writeback until it submits the pages
- * for IO, which happens in an async thread, so we could
- * race and not actually wait for any writeback pages
- * because they've not been submitted yet. Technically
- * this could still be the case for the ordered stuff
- * since the async thread may not have started to do its
- * work yet. If this becomes the case then we need to
- * figure out a way to make sure that in writepage we
- * wait for any async pages to be submitted before
- * returning so that fdatawait does what its supposed to
- * do.
- */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
- } else {
- filemap_flush(inode->i_mapping);
- }
- if (delay_iput)
- btrfs_add_delayed_iput(inode);
- else
- iput(inode);
- return 1;
- }
- return 0;
-}
-
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
@@ -6957,9 +7122,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
- if (err)
- return err;
/*
* 2 items for inode item and ref
* 2 items for dir items
@@ -6971,15 +7133,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
btrfs_set_trans_block_group(trans, dir);
+ err = btrfs_find_free_ino(root, &objectid);
+ if (err)
+ goto out_unlock;
+
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
- dentry->d_name.len, dir->i_ino, objectid,
+ dentry->d_name.len, btrfs_ino(dir), objectid,
BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
&index);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
goto out_unlock;
+ }
- err = btrfs_init_inode_security(trans, inode, dir);
+ err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err) {
drop_inode = 1;
goto out_unlock;
@@ -7003,7 +7170,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
path = btrfs_alloc_path();
BUG_ON(!path);
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
datasize = btrfs_file_extent_calc_inline_size(name_len);
@@ -7011,6 +7178,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
datasize);
if (err) {
drop_inode = 1;
+ btrfs_free_path(path);
goto out_unlock;
}
leaf = path->nodes[0];
@@ -7215,7 +7383,6 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readpages = btrfs_readpages,
- .sync_page = block_sync_page,
.direct_IO = btrfs_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
@@ -7231,7 +7398,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
};
static const struct inode_operations btrfs_file_inode_operations = {
- .truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.setxattr = btrfs_setxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 02d224e8c83f..85e818ce00c5 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
#include <linux/xattr.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
+#include <linux/blkdev.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -49,6 +50,7 @@
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
+#include "inode-map.h"
/* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -80,6 +82,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
iflags |= FS_NOATIME_FL;
if (flags & BTRFS_INODE_DIRSYNC)
iflags |= FS_DIRSYNC_FL;
+ if (flags & BTRFS_INODE_NODATACOW)
+ iflags |= FS_NOCOW_FL;
+
+ if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
+ iflags |= FS_COMPR_FL;
+ else if (flags & BTRFS_INODE_NOCOMPRESS)
+ iflags |= FS_NOCOMP_FL;
return iflags;
}
@@ -138,6 +147,21 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
+/*
+ * Validate a flag word handed in through the setflags ioctl.
+ *
+ * Returns -EOPNOTSUPP if any bit outside the supported set is present,
+ * -EINVAL if both FS_NOCOMP_FL and FS_COMPR_FL are requested, 0 otherwise.
+ */
+static int check_flags(unsigned int flags)
+{
+	const unsigned int supported = FS_IMMUTABLE_FL | FS_APPEND_FL |
+				       FS_NOATIME_FL | FS_NODUMP_FL |
+				       FS_SYNC_FL | FS_DIRSYNC_FL |
+				       FS_NOCOMP_FL | FS_COMPR_FL |
+				       FS_NOCOW_FL;
+
+	if (flags & ~supported)
+		return -EOPNOTSUPP;
+
+	/* the two compression requests contradict each other */
+	if (!(flags & FS_NOCOMP_FL) || !(flags & FS_COMPR_FL))
+		return 0;
+
+	return -EINVAL;
+}
+
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file->f_path.dentry->d_inode;
@@ -153,12 +177,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
- FS_NOATIME_FL | FS_NODUMP_FL | \
- FS_SYNC_FL | FS_DIRSYNC_FL))
- return -EOPNOTSUPP;
+ ret = check_flags(flags);
+ if (ret)
+ return ret;
- if (!is_owner_or_cap(inode))
+ if (!inode_owner_or_capable(inode))
return -EACCES;
mutex_lock(&inode->i_mutex);
@@ -200,7 +223,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
ip->flags |= BTRFS_INODE_DIRSYNC;
else
ip->flags &= ~BTRFS_INODE_DIRSYNC;
+ if (flags & FS_NOCOW_FL)
+ ip->flags |= BTRFS_INODE_NODATACOW;
+ else
+ ip->flags &= ~BTRFS_INODE_NODATACOW;
+ /*
+ * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
+ * flag may be changed automatically if compression code won't make
+ * things smaller.
+ */
+ if (flags & FS_NOCOMP_FL) {
+ ip->flags &= ~BTRFS_INODE_COMPRESS;
+ ip->flags |= BTRFS_INODE_NOCOMPRESS;
+ } else if (flags & FS_COMPR_FL) {
+ ip->flags |= BTRFS_INODE_COMPRESS;
+ ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ } else {
+ ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ }
trans = btrfs_join_transaction(root, 1);
BUG_ON(IS_ERR(trans));
@@ -213,9 +254,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
btrfs_end_transaction(trans, root);
mnt_drop_write(file->f_path.mnt);
+
+ ret = 0;
out_unlock:
mutex_unlock(&inode->i_mutex);
- return 0;
+ return ret;
}
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
@@ -225,6 +268,50 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
return put_user(inode->i_generation, arg);
}
+/*
+ * FITRIM handler: requires CAP_SYS_ADMIN, walks the device list to find the
+ * smallest discard granularity among devices whose queue supports discard,
+ * raises the caller's minlen to at least that value and hands the range to
+ * btrfs_trim_fs().  The (possibly updated) range is copied back to @arg.
+ *
+ * Returns -EPERM, -EOPNOTSUPP (no device supports discard), -EFAULT, a
+ * negative value from btrfs_trim_fs(), or 0 on success.
+ */
+static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device *dev;
+	struct request_queue *queue;
+	struct fstrim_range range;
+	u64 granularity = ULLONG_MAX;
+	u64 nr_discard_devs = 0;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices,
+				dev_list) {
+		if (!dev->bdev)
+			continue;
+		queue = bdev_get_queue(dev->bdev);
+		if (!blk_queue_discard(queue))
+			continue;
+		nr_discard_devs++;
+		granularity = min((u64)queue->limits.discard_granularity,
+				  granularity);
+	}
+	rcu_read_unlock();
+	if (nr_discard_devs == 0)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&range, arg, sizeof(range)))
+		return -EFAULT;
+
+	/* never trim in units smaller than the devices can discard */
+	range.minlen = max(range.minlen, granularity);
+	ret = btrfs_trim_fs(root, &range);
+	if (ret < 0)
+		return ret;
+
+	if (copy_to_user(arg, &range, sizeof(range)))
+		return -EFAULT;
+
+	return 0;
+}
+
static noinline int create_subvol(struct btrfs_root *root,
struct dentry *dentry,
char *name, int namelen,
@@ -244,8 +331,7 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
- ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
- 0, &objectid);
+ ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
if (ret) {
dput(parent);
return ret;
@@ -294,6 +380,10 @@ static noinline int create_subvol(struct btrfs_root *root,
inode_item->nbytes = cpu_to_le64(root->leafsize);
inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+ root_item.flags = 0;
+ root_item.byte_limit = 0;
+ inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
+
btrfs_set_root_bytenr(&root_item, leaf->start);
btrfs_set_root_generation(&root_item, trans->transid);
btrfs_set_root_level(&root_item, 0);
@@ -333,7 +423,7 @@ static noinline int create_subvol(struct btrfs_root *root,
BUG_ON(ret);
ret = btrfs_insert_dir_item(trans, root,
- name, namelen, dir->i_ino, &key,
+ name, namelen, dir, &key,
BTRFS_FT_DIR, index);
if (ret)
goto fail;
@@ -344,7 +434,7 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
- dir->i_ino, index, name, namelen);
+ btrfs_ino(dir), index, name, namelen);
BUG_ON(ret);
@@ -409,7 +499,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (ret)
goto fail;
- btrfs_orphan_cleanup(pending_snapshot->snap);
+ ret = btrfs_orphan_cleanup(pending_snapshot->snap);
+ if (ret)
+ goto fail;
parent = dget_parent(dentry);
inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@ -564,6 +656,106 @@ out_unlock:
return error;
}
+/*
+ * Decide whether the range starting at @offset is still worth defragging.
+ *
+ * A range that is only waiting for delalloc to send it down should not be
+ * kicked off again.  If the cached extent map at @offset reaches more than
+ * @thresh past @offset, or count_range_bits() reports at least @thresh / 2
+ * for EXTENT_DELALLOC there, return 0 so the caller skips this part of the
+ * file; otherwise return 1.
+ */
+static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	u64 found;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	read_unlock(&em_tree->lock);
+
+	if (em) {
+		u64 em_end = extent_map_end(em);
+
+		free_extent_map(em);
+		if (em_end - offset > thresh)
+			return 0;
+	}
+
+	/* if we already have a nice delalloc here, just stop */
+	thresh /= 2;
+	found = count_range_bits(io_tree, &offset, offset + thresh,
+				 thresh, EXTENT_DELALLOC, 1);
+	return found < thresh;
+}
+
+/*
+ * helper function to walk through a file and find extents
+ * newer than a specific transid, and smaller than thresh.
+ *
+ * This is used by the defragging code to find new and small
+ * extents
+ *
+ * The search starts at key offset *off and moves forward with
+ * btrfs_search_forward(), passing @newer_than as its transid filter.
+ *
+ * Returns 0 and stores the key offset of the first regular extent item
+ * that is smaller than @thresh and that check_defrag_in_cache() does not
+ * ask us to skip.  Returns -ENOMEM if no path could be allocated, and
+ * -ENOENT when the search leaves this inode's extent items or
+ * btrfs_search_forward() returns anything other than 0.
+ */
+static int find_new_extents(struct btrfs_root *root,
+			    struct inode *inode, u64 newer_than,
+			    u64 *off, int thresh)
+{
+	struct btrfs_path *path;
+	struct btrfs_key min_key;
+	struct btrfs_key max_key;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *extent;
+	/*
+	 * Tree keys are built from the btrfs objectid; use btrfs_ino() like
+	 * the rest of this file rather than the VFS i_ino, which is only an
+	 * unsigned long.
+	 */
+	u64 ino = btrfs_ino(inode);
+	int type;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	min_key.objectid = ino;
+	min_key.type = BTRFS_EXTENT_DATA_KEY;
+	min_key.offset = *off;
+
+	max_key.objectid = ino;
+	max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+	path->keep_locks = 1;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &min_key, &max_key,
+					   path, 0, newer_than);
+		if (ret != 0)
+			goto none;
+		/* the search walked off this inode or past its extent items */
+		if (min_key.objectid != ino)
+			goto none;
+		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
+			goto none;
+
+		leaf = path->nodes[0];
+		extent = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_file_extent_item);
+
+		type = btrfs_file_extent_type(leaf, extent);
+		if (type == BTRFS_FILE_EXTENT_REG &&
+		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
+		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
+			*off = min_key.offset;
+			btrfs_free_path(path);
+			return 0;
+		}
+
+		/* cannot advance past the largest possible offset */
+		if (min_key.offset == (u64)-1)
+			goto none;
+
+		min_key.offset++;
+		btrfs_release_path(path);
+	}
+none:
+	btrfs_free_path(path);
+	return -ENOENT;
+}
+
static int should_defrag_range(struct inode *inode, u64 start, u64 len,
int thresh, u64 *last_len, u64 *skip,
u64 *defrag_end)
@@ -573,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 1;
-
- if (thresh == 0)
- thresh = 256 * 1024;
-
/*
* make sure that once we start defragging and extent, we keep on
* defragging it
@@ -635,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
return ret;
}
-static int btrfs_defrag_file(struct file *file,
- struct btrfs_ioctl_defrag_range_args *range)
+/*
+ * it doesn't do much good to defrag one or two pages
+ * at a time. This pulls in a nice chunk of pages
+ * to COW and defrag.
+ *
+ * It also makes sure the delalloc code has enough
+ * dirty data to avoid making new small extents as part
+ * of the defrag
+ *
+ * It's a good idea to start RA on this range
+ * before calling this.
+ *
+ * @pages is scratch space that must have room for @num_pages entries;
+ * every page stored there is unlocked and released again before this
+ * function returns.
+ *
+ * Returns the number of pages that were marked delalloc and re-dirtied,
+ * 0 if there was nothing to do (empty file, no page could be grabbed, or
+ * the superblock is no longer MS_ACTIVE), the non-zero result of a failed
+ * btrfs_delalloc_reserve_space(), or -EIO if a page could not be brought
+ * up to date.
+ */
+static int cluster_pages_for_defrag(struct inode *inode,
+ struct page **pages,
+ unsigned long start_index,
+ int num_pages)
{
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ unsigned long file_end;
+ u64 isize = i_size_read(inode);
+ u64 page_start;
+ u64 page_end;
+ int ret;
+ int i;
+ int i_done; /* number of pages successfully locked into pages[] */
struct btrfs_ordered_extent *ordered;
- struct page *page;
+ struct extent_state *cached_state = NULL;
+
+ if (isize == 0)
+ return 0;
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+
+ ret = btrfs_delalloc_reserve_space(inode,
+ num_pages << PAGE_CACHE_SHIFT); /* reserve for the whole cluster up front */
+ if (ret)
+ return ret;
+again:
+ ret = 0;
+ i_done = 0;
+
+ /* step one, lock all the pages */
+ for (i = 0; i < num_pages; i++) {
+ struct page *page;
+ page = grab_cache_page(inode->i_mapping,
+ start_index + i);
+ if (!page)
+ break;
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page); /* NOTE(review): presumably btrfs_readpage() drops the page lock - confirm */
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+ isize = i_size_read(inode); /* re-read: the size may have changed since entry */
+ file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (!isize || page->index > file_end ||
+ page->mapping != inode->i_mapping) {
+ /* whoops, we blew past eof, skip this page */
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ pages[i] = page;
+ i_done++;
+ }
+ if (!i_done || ret)
+ goto out;
+
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ goto out;
+
+ /*
+ * so now we have a nice long stream of locked
+ * and up to date pages, lets wait on them
+ */
+ for (i = 0; i < i_done; i++)
+ wait_on_page_writeback(pages[i]);
+
+ page_start = page_offset(pages[0]);
+ page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, 0, &cached_state,
+ GFP_NOFS);
+ ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
+ if (ordered &&
+ ordered->file_offset + ordered->len > page_start &&
+ ordered->file_offset < page_end) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1,
+ &cached_state, GFP_NOFS);
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_wait_ordered_range(inode, page_start,
+ page_end - page_start);
+ goto again; /* an ordered extent overlapped the range: everything was dropped, retry */
+ }
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
+ page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+ GFP_NOFS);
+
+ if (i_done != num_pages) { /* got fewer pages than reserved for: return the surplus */
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ btrfs_delalloc_release_space(inode,
+ (num_pages - i_done) << PAGE_CACHE_SHIFT);
+ }
+
+
+ btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
+ &cached_state);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ page_start, page_end - 1, &cached_state,
+ GFP_NOFS);
+
+ for (i = 0; i < i_done; i++) {
+ clear_page_dirty_for_io(pages[i]);
+ ClearPageChecked(pages[i]);
+ set_page_extent_mapped(pages[i]);
+ set_page_dirty(pages[i]);
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ return i_done;
+out:
+ for (i = 0; i < i_done; i++) {
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ }
+ btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); /* nothing was dirtied: give the full reservation back */
+ return ret;
+
+}
+
+int btrfs_defrag_file(struct inode *inode, struct file *file,
+ struct btrfs_ioctl_defrag_range_args *range,
+ u64 newer_than, unsigned long max_to_defrag)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *disk_super;
+ struct file_ra_state *ra = NULL;
unsigned long last_index;
- unsigned long ra_pages = root->fs_info->bdi.ra_pages;
- unsigned long total_read = 0;
u64 features;
- u64 page_start;
- u64 page_end;
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
+ u64 newer_off = range->start;
+ int newer_left = 0;
unsigned long i;
int ret;
+ int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int extent_thresh = range->extent_thresh;
+ int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ u64 new_align = ~((u64)128 * 1024 - 1);
+ struct page **pages = NULL;
+
+ if (extent_thresh == 0)
+ extent_thresh = 256 * 1024;
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -667,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file,
if (inode->i_size == 0)
return 0;
+ /*
+ * if we were not given a file, allocate a readahead
+ * context
+ */
+ if (!file) {
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+ return -ENOMEM;
+ file_ra_state_init(ra, inode->i_mapping);
+ } else {
+ ra = &file->f_ra;
+ }
+
+ pages = kmalloc(sizeof(struct page *) * newer_cluster,
+ GFP_NOFS);
+ if (!pages) {
+ ret = -ENOMEM;
+ goto out_ra;
+ }
+
+ /* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, inode->i_size - 1,
range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -674,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file,
last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
}
- i = range->start >> PAGE_CACHE_SHIFT;
- while (i <= last_index) {
- if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+ if (newer_than) {
+ ret = find_new_extents(root, inode, newer_than,
+ &newer_off, 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ /*
+ * we always align our defrag to help keep
+ * the extents in the file evenly spaced
+ */
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ newer_left = newer_cluster;
+ } else
+ goto out_ra;
+ } else {
+ i = range->start >> PAGE_CACHE_SHIFT;
+ }
+	/*
+	 * no limit was given: allow every page in [0, last_index] to be
+	 * defragged.  Using last_index - 1 here gives a limit of 0 for a
+	 * two page file, so the loop below would never run for it.
+	 */
+	if (!max_to_defrag)
+		max_to_defrag = last_index + 1;
+
+ while (i <= last_index && defrag_count < max_to_defrag) {
+ /*
+ * make sure we stop running if someone unmounts
+ * the FS
+ */
+ if (!(inode->i_sb->s_flags & MS_ACTIVE))
+ break;
+
+ if (!newer_than &&
+ !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
PAGE_CACHE_SIZE,
- range->extent_thresh,
+ extent_thresh,
&last_len, &skip,
&defrag_end)) {
unsigned long next;
@@ -690,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file,
i = max(i + 1, next);
continue;
}
-
- if (total_read % ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
- min(last_index, i + ra_pages - 1));
- }
- total_read++;
- mutex_lock(&inode->i_mutex);
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = compress_type;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
- if (ret)
- goto err_unlock;
-again:
- if (inode->i_size == 0 ||
- i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
- ret = 0;
- goto err_reservations;
- }
-
- page = grab_cache_page(inode->i_mapping, i);
- if (!page) {
- ret = -ENOMEM;
- goto err_reservations;
- }
-
- if (!PageUptodate(page)) {
- btrfs_readpage(NULL, page);
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- page_cache_release(page);
- ret = -EIO;
- goto err_reservations;
- }
- }
+ btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
- page_cache_release(page);
- goto again;
- }
-
- wait_on_page_writeback(page);
+ ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+ if (ret < 0)
+ goto out_ra;
- if (PageDirty(page)) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
- goto loop_unlock;
- }
+ defrag_count += ret;
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+ i += ret;
- page_start = (u64)page->index << PAGE_CACHE_SHIFT;
- page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ if (newer_than) {
+ if (newer_off == (u64)-1)
+ break;
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
+ newer_off = max(newer_off + 1,
+ (u64)i << PAGE_CACHE_SHIFT);
+
+ ret = find_new_extents(root, inode,
+ newer_than, &newer_off,
+ 64 * 1024);
+ if (!ret) {
+ range->start = newer_off;
+ i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+ newer_left = newer_cluster;
+ } else {
+ break;
+ }
+ } else {
+ i++;
}
- set_page_extent_mapped(page);
-
- /*
- * this makes sure page_mkwrite is called on the
- * page if it is dirtied again later
- */
- clear_page_dirty_for_io(page);
- clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
- page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, GFP_NOFS);
-
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
- ClearPageChecked(page);
- set_page_dirty(page);
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-loop_unlock:
- unlock_page(page);
- page_cache_release(page);
- mutex_unlock(&inode->i_mutex);
-
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
- i++;
}
if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -807,12 +1138,14 @@ loop_unlock:
btrfs_set_super_incompat_flags(disk_super, features);
}
- return 0;
+	/*
+	 * success: release the readahead state we allocated when no file
+	 * was passed in, and the page array, which is otherwise leaked
+	 */
+	if (!file)
+		kfree(ra);
+	kfree(pages);
+	return defrag_count;
-err_reservations:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-err_unlock:
- mutex_unlock(&inode->i_mutex);
+out_ra:
+ if (!file)
+ kfree(ra);
+ kfree(pages);
return ret;
}
@@ -1038,7 +1371,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
int ret = 0;
u64 flags = 0;
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&root->fs_info->subvol_sem);
@@ -1065,18 +1398,21 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
if (root->fs_info->sb->s_flags & MS_RDONLY)
return -EROFS;
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
- if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+ if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
return -EINVAL;
if (flags & ~BTRFS_SUBVOL_RDONLY)
return -EOPNOTSUPP;
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
down_write(&root->fs_info->subvol_sem);
/* nothing to do */
@@ -1097,7 +1433,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_reset;
}
- ret = btrfs_update_root(trans, root,
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item);
btrfs_commit_transaction(trans, root);
@@ -1185,7 +1521,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
int nritems;
int i;
int slot;
- int found = 0;
int ret = 0;
leaf = path->nodes[0];
@@ -1232,7 +1567,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
item_off, item_len);
*sk_offset += item_len;
}
- found++;
+ (*num_found)++;
if (*num_found >= sk->nr_items)
break;
@@ -1251,7 +1586,6 @@ advance_key:
} else
ret = 1;
overflow:
- *num_found += found;
return ret;
}
@@ -1308,7 +1642,7 @@ static noinline int search_ioctl(struct inode *inode,
}
ret = copy_to_sk(root, path, &key, sk, args->buf,
&sk_offset, &num_found);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret || num_found >= sk->nr_items)
break;
@@ -1415,7 +1749,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
break;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
@@ -1545,7 +1879,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
}
@@ -1663,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- ret = btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
+ range, 0, 0);
+ if (ret > 0)
+ ret = 0;
kfree(range);
break;
default:
@@ -1715,6 +2052,75 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret;
}
+/*
+ * BTRFS_IOC_FS_INFO: report the number of devices, the highest devid and
+ * the fsid of the filesystem to userspace through @arg.
+ *
+ * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM without the
+ * capability, or -EFAULT if the result cannot be copied to @arg.
+ */
+static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_fs_info_args fi_args;
+	struct btrfs_device *device;
+	struct btrfs_device *next;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/*
+	 * the whole struct, including the reserved padding, is copied to
+	 * userspace below, so clear it first to avoid leaking stack
+	 * contents.  This also leaves max_id at 0.
+	 */
+	memset(&fi_args, 0, sizeof(fi_args));
+	fi_args.num_devices = fs_devices->num_devices;
+	memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid));
+
+	/* find the highest devid currently on the device list */
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+		if (device->devid > fi_args.max_id)
+			fi_args.max_id = device->devid;
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	if (copy_to_user(arg, &fi_args, sizeof(fi_args)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * BTRFS_IOC_DEV_INFO: look up a single device and report its devid, uuid,
+ * byte counters and path to userspace.
+ *
+ * @arg points to a struct btrfs_ioctl_dev_info_args.  The device is looked
+ * up by devid; the uuid is handed to the lookup as well unless userspace
+ * left it all-zero.
+ *
+ * Requires CAP_SYS_ADMIN.  Returns 0 on success, -EPERM without the
+ * capability, the memdup_user() error if @arg cannot be read, -ENODEV if
+ * no device was found, or -EFAULT if the result cannot be copied back.
+ */
+static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_dev_info_args *di_args;
+	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int ret = 0;
+	char *s_uuid = NULL;
+	char empty_uuid[BTRFS_UUID_SIZE] = {0};
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	di_args = memdup_user(arg, sizeof(*di_args));
+	if (IS_ERR(di_args))
+		return PTR_ERR(di_args);
+
+	/* an all-zero uuid leaves s_uuid NULL for the lookup below */
+	if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
+		s_uuid = di_args->uuid;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	if (!dev) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	di_args->devid = dev->devid;
+	di_args->bytes_used = dev->bytes_used;
+	di_args->total_bytes = dev->total_bytes;
+	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
+	/*
+	 * strncpy() does not terminate the destination when the source
+	 * fills it completely, so terminate the path explicitly before it
+	 * is handed to userspace
+	 */
+	strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	di_args->path[sizeof(di_args->path) - 1] = '\0';
+
+out:
+	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
+		ret = -EFAULT;
+
+	kfree(di_args);
+	return ret;
+}
+
static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
@@ -1831,7 +2237,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
/* clone data */
- key.objectid = src->i_ino;
+ key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
@@ -1858,7 +2264,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_item_key_to_cpu(leaf, &key, slot);
if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
- key.objectid != src->i_ino)
+ key.objectid != btrfs_ino(src))
break;
if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
@@ -1894,14 +2300,14 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datal = btrfs_file_extent_ram_bytes(leaf,
extent);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (key.offset + datal <= off ||
key.offset >= off+len)
goto next;
memcpy(&new_key, &key, sizeof(new_key));
- new_key.objectid = inode->i_ino;
+ new_key.objectid = btrfs_ino(inode);
if (off <= key.offset)
new_key.offset = key.offset + destoff - off;
else
@@ -1955,7 +2361,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
ret = btrfs_inc_extent_ref(trans, root,
disko, diskl, 0,
root->root_key.objectid,
- inode->i_ino,
+ btrfs_ino(inode),
new_key.offset - datao);
BUG_ON(ret);
}
@@ -2004,7 +2410,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -2025,12 +2431,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_end_transaction(trans, root);
}
next:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
key.offset++;
}
ret = 0;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
out_unlock:
mutex_unlock(&src->i_mutex);
@@ -2199,7 +2605,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
- struct btrfs_ioctl_space_info *user_dest;
+ struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2208,7 +2614,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
int num_types = 4;
int alloc_size;
int ret = 0;
- int slot_count = 0;
+ u64 slot_count = 0;
int i, c;
if (copy_from_user(&space_args,
@@ -2247,7 +2653,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
goto out;
}
- slot_count = min_t(int, space_args.space_slots, slot_count);
+ slot_count = min_t(u64, space_args.space_slots, slot_count);
alloc_size = sizeof(*dest) * slot_count;
@@ -2267,6 +2673,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
for (i = 0; i < num_types; i++) {
struct btrfs_space_info *tmp;
+ if (!slot_count)
+ break;
+
info = NULL;
rcu_read_lock();
list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2288,7 +2697,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
+ slot_count--;
}
+ if (!slot_count)
+ break;
}
up_read(&info->groups_sem);
}
@@ -2339,12 +2751,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
struct btrfs_trans_handle *trans;
u64 transid;
+ int ret;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans))
return PTR_ERR(trans);
transid = trans->transid;
- btrfs_commit_transaction_async(trans, root, 0);
+ ret = btrfs_commit_transaction_async(trans, root, 0);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -2366,6 +2783,58 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
return btrfs_wait_for_commit(root, transid);
}
+/*
+ * BTRFS_IOC_SCRUB: run btrfs_scrub_dev() with the devid, range and flags
+ * read from @arg, then copy the argument struct, including the progress
+ * counters, back to userspace.
+ *
+ * Requires CAP_SYS_ADMIN.  Returns the result of btrfs_scrub_dev(),
+ * -EPERM without the capability, the memdup_user() error if @arg cannot
+ * be read, or -EFAULT if the copy back fails.
+ */
+static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_scrub_args *scrub_args;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	scrub_args = memdup_user(arg, sizeof(*scrub_args));
+	if (IS_ERR(scrub_args))
+		return PTR_ERR(scrub_args);
+
+	err = btrfs_scrub_dev(root, scrub_args->devid, scrub_args->start,
+			      scrub_args->end, &scrub_args->progress,
+			      scrub_args->flags & BTRFS_SCRUB_READONLY);
+
+	/* the struct is copied back regardless of the scrub result */
+	if (copy_to_user(arg, scrub_args, sizeof(*scrub_args)))
+		err = -EFAULT;
+
+	kfree(scrub_args);
+	return err;
+}
+
+/*
+ * BTRFS_IOC_SCRUB_CANCEL: forward to btrfs_scrub_cancel() for @root.
+ * @arg is not used.  Requires CAP_SYS_ADMIN, otherwise -EPERM.
+ */
+static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
+{
+	long err = -EPERM;
+
+	if (capable(CAP_SYS_ADMIN))
+		err = btrfs_scrub_cancel(root);
+
+	return err;
+}
+
+/*
+ * BTRFS_IOC_SCRUB_PROGRESS: call btrfs_scrub_progress() for the devid
+ * read from @arg and copy the argument struct, including the progress
+ * counters, back to userspace.
+ *
+ * Requires CAP_SYS_ADMIN.  Returns the result of btrfs_scrub_progress(),
+ * -EPERM without the capability, the memdup_user() error if @arg cannot
+ * be read, or -EFAULT if the copy back fails.
+ */
+static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
+				       void __user *arg)
+{
+	struct btrfs_ioctl_scrub_args *scrub_args;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	scrub_args = memdup_user(arg, sizeof(*scrub_args));
+	if (IS_ERR(scrub_args))
+		return PTR_ERR(scrub_args);
+
+	err = btrfs_scrub_progress(root, scrub_args->devid,
+				   &scrub_args->progress);
+
+	/* the struct is copied back regardless of the lookup result */
+	if (copy_to_user(arg, scrub_args, sizeof(*scrub_args)))
+		err = -EFAULT;
+
+	kfree(scrub_args);
+	return err;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -2379,6 +2848,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
+ case FITRIM:
+ return btrfs_ioctl_fitrim(file, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
@@ -2403,6 +2874,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_add_dev(root, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(root, argp);
+ case BTRFS_IOC_FS_INFO:
+ return btrfs_ioctl_fs_info(root, argp);
+ case BTRFS_IOC_DEV_INFO:
+ return btrfs_ioctl_dev_info(root, argp);
case BTRFS_IOC_BALANCE:
return btrfs_balance(root->fs_info->dev_root);
case BTRFS_IOC_CLONE:
@@ -2426,6 +2901,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_start_sync(file, argp);
case BTRFS_IOC_WAIT_SYNC:
return btrfs_ioctl_wait_sync(file, argp);
+ case BTRFS_IOC_SCRUB:
+ return btrfs_ioctl_scrub(root, argp);
+ case BTRFS_IOC_SCRUB_CANCEL:
+ return btrfs_ioctl_scrub_cancel(root, argp);
+ case BTRFS_IOC_SCRUB_PROGRESS:
+ return btrfs_ioctl_scrub_progress(root, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 8fb382167b13..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -32,6 +32,8 @@ struct btrfs_ioctl_vol_args {
#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
+#define BTRFS_FSID_SIZE 16
+#define BTRFS_UUID_SIZE 16
#define BTRFS_SUBVOL_NAME_MAX 4039
struct btrfs_ioctl_vol_args_v2 {
@@ -42,6 +44,71 @@ struct btrfs_ioctl_vol_args_v2 {
char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
+/*
+ * structure to report errors and progress to userspace, either as a
+ * result of a finished scrub, a canceled scrub or a progress inquiry
+ */
+struct btrfs_scrub_progress {
+ __u64 data_extents_scrubbed; /* # of data extents scrubbed */
+ __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
+ __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
+ __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
+ __u64 read_errors; /* # of read errors encountered (EIO) */
+ __u64 csum_errors; /* # of failed csum checks */
+ __u64 verify_errors; /* # of occurrences where the metadata
+ * of a tree block did not match the
+ * expected values, like generation or
+ * logical */
+ __u64 no_csum; /* # of 4k data blocks for which no csum
+ * is present, probably the result of
+ * data written with nodatasum */
+ __u64 csum_discards; /* # of csums for which no data was found
+ * in the extent tree. */
+ __u64 super_errors; /* # of bad super blocks encountered */
+ __u64 malloc_errors; /* # of internal kmalloc errors. These
+ * will likely cause an incomplete
+ * scrub */
+ __u64 uncorrectable_errors; /* # of errors where either no intact
+ * copy was found or the writeback
+ * failed */
+ __u64 corrected_errors; /* # of errors corrected */
+ __u64 last_physical; /* last physical address scrubbed. In
+ * case a scrub was aborted, this can
+ * be used to restart the scrub */
+ __u64 unverified_errors; /* # of occurrences where a read for a
+ * full (64k) bio failed, but the re-
+ * check succeeded for each 4k piece.
+ * Intermittent error. */
+};
+
+#define BTRFS_SCRUB_READONLY 1
+struct btrfs_ioctl_scrub_args {
+ __u64 devid; /* in */
+ __u64 start; /* in */
+ __u64 end; /* in */
+ __u64 flags; /* in */
+ struct btrfs_scrub_progress progress; /* out */
+ /* pad to 1k */
+ __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
+};
+
+#define BTRFS_DEVICE_PATH_NAME_MAX 1024
+struct btrfs_ioctl_dev_info_args {
+ __u64 devid; /* in/out */
+ __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
+ __u64 bytes_used; /* out */
+ __u64 total_bytes; /* out */
+ __u64 unused[379]; /* pad to 4k */
+ __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
+};
+
+struct btrfs_ioctl_fs_info_args {
+ __u64 max_id; /* out */
+ __u64 num_devices; /* out */
+ __u8 fsid[BTRFS_FSID_SIZE]; /* out */
+ __u64 reserved[124]; /* pad to 1k */
+};
+
#define BTRFS_INO_LOOKUP_PATH_MAX 4080
struct btrfs_ioctl_ino_lookup_args {
__u64 treeid;
@@ -114,37 +181,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_DEFRAG_RANGE_COMPRESS 1
#define BTRFS_DEFRAG_RANGE_START_IO 2
-struct btrfs_ioctl_defrag_range_args {
- /* start of the defrag operation */
- __u64 start;
-
- /* number of bytes to defrag, use (u64)-1 to say all */
- __u64 len;
-
- /*
- * flags for the operation, which can include turning
- * on compression for this one defrag
- */
- __u64 flags;
-
- /*
- * any extent bigger than this will be considered
- * already defragged. Use 0 to take the kernel default
- * Use 1 to say every single extent must be rewritten
- */
- __u32 extent_thresh;
-
- /*
- * which compression method to use if turning on compression
- * for this defrag operation. If unspecified, zlib will
- * be used
- */
- __u32 compress_type;
-
- /* spare for later */
- __u32 unused[4];
-};
-
struct btrfs_ioctl_space_info {
__u64 flags;
__u64 total_bytes;
@@ -203,4 +239,13 @@ struct btrfs_ioctl_space_args {
struct btrfs_ioctl_vol_args_v2)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
+#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
+ struct btrfs_ioctl_scrub_args)
+#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
+#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
+ struct btrfs_ioctl_scrub_args)
+#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
+ struct btrfs_ioctl_dev_info_args)
+#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
+ struct btrfs_ioctl_fs_info_args)
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6151f2ea38bb..66fa43dc3f0f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -185,31 +185,6 @@ sleep:
return 0;
}
-/*
- * Very quick trylock, this does not spin or schedule. It returns
- * 1 with the spinlock held if it was able to take the lock, or it
- * returns zero if it was unable to take the lock.
- *
- * After this call, scheduling is not safe without first calling
- * btrfs_set_lock_blocking()
- */
-int btrfs_try_tree_lock(struct extent_buffer *eb)
-{
- if (spin_trylock(&eb->lock)) {
- if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
- /*
- * we've got the spinlock, but the real owner is
- * blocking. Drop the spinlock and return failure
- */
- spin_unlock(&eb->lock);
- return 0;
- }
- return 1;
- }
- /* someone else has the spinlock giveup */
- return 0;
-}
-
int btrfs_tree_unlock(struct extent_buffer *eb)
{
/*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce457168c..5c33a560a2f1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,8 +21,6 @@
int btrfs_tree_lock(struct extent_buffer *eb);
int btrfs_tree_unlock(struct extent_buffer *eb);
-
-int btrfs_try_tree_lock(struct extent_buffer *eb);
int btrfs_try_spin_lock(struct extent_buffer *eb);
void btrfs_set_lock_blocking(struct extent_buffer *eb);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
unsigned long tot_out;
unsigned long tot_len;
char *buf;
+ bool may_late_unmap, need_unmap;
data_in = kmap(pages_in[0]);
tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
tot_in += in_len;
working_bytes = in_len;
+ may_late_unmap = need_unmap = false;
/* fast path: avoid using the working buffer */
if (in_page_bytes_left >= in_len) {
buf = data_in + in_offset;
bytes = in_len;
+ may_late_unmap = true;
goto cont;
}
@@ -329,14 +332,17 @@ cont:
if (working_bytes == 0 && tot_in >= tot_len)
break;
- kunmap(pages_in[page_in_index]);
- page_in_index++;
- if (page_in_index >= total_pages_in) {
+ if (page_in_index + 1 >= total_pages_in) {
ret = -1;
- data_in = NULL;
goto done;
}
- data_in = kmap(pages_in[page_in_index]);
+
+ if (may_late_unmap)
+ need_unmap = true;
+ else
+ kunmap(pages_in[page_in_index]);
+
+ data_in = kmap(pages_in[++page_in_index]);
in_page_bytes_left = PAGE_CACHE_SIZE;
in_offset = 0;
@@ -346,6 +352,8 @@ cont:
out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
&out_len);
+ if (need_unmap)
+ kunmap(pages_in[page_in_index - 1]);
if (ret != LZO_E_OK) {
printk(KERN_WARNING "btrfs decompress failed\n");
ret = -1;
@@ -363,8 +371,7 @@ cont:
break;
}
done:
- if (data_in)
- kunmap(pages_in[page_in_index]);
+ kunmap(pages_in[page_in_index]);
return ret;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 083a55477375..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->list);
INIT_LIST_HEAD(&entry->root_extent_list);
+ trace_btrfs_ordered_extent_add(inode, entry);
+
spin_lock(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
&entry->rb_node);
@@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
+ trace_btrfs_ordered_extent_put(entry->inode, entry);
+
if (atomic_dec_and_test(&entry->refs)) {
while (!list_empty(&entry->list)) {
cur = entry->list.next;
@@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+ trace_btrfs_ordered_extent_remove(inode, entry);
+
/*
* we have no more ordered extents for this inode and
* no dirty pages. We can safely remove it from the
@@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
u64 start = entry->file_offset;
u64 end = start + entry->len - 1;
+ trace_btrfs_ordered_extent_start(inode, entry);
+
/*
* pages in the range can be dirty, clean or writeback. We
* start IO on any dirty ones so the wait doesn't stall waiting
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a97314cf6bd6..82d569cb6267 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -23,56 +23,6 @@
#include "ref-cache.h"
#include "transaction.h"
-/*
- * leaf refs are used to cache the information about which extents
- * a given leaf has references on. This allows us to process that leaf
- * in btrfs_drop_snapshot without needing to read it back from disk.
- */
-
-/*
- * kmalloc a leaf reference struct and update the counters for the
- * total ref cache size
- */
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
- int nr_extents)
-{
- struct btrfs_leaf_ref *ref;
- size_t size = btrfs_leaf_ref_size(nr_extents);
-
- ref = kmalloc(size, GFP_NOFS);
- if (ref) {
- spin_lock(&root->fs_info->ref_cache_lock);
- root->fs_info->total_ref_cache_size += size;
- spin_unlock(&root->fs_info->ref_cache_lock);
-
- memset(ref, 0, sizeof(*ref));
- atomic_set(&ref->usage, 1);
- INIT_LIST_HEAD(&ref->list);
- }
- return ref;
-}
-
-/*
- * free a leaf reference struct and update the counters for the
- * total ref cache size
- */
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
- if (!ref)
- return;
- WARN_ON(atomic_read(&ref->usage) == 0);
- if (atomic_dec_and_test(&ref->usage)) {
- size_t size = btrfs_leaf_ref_size(ref->nritems);
-
- BUG_ON(ref->in_tree);
- kfree(ref);
-
- spin_lock(&root->fs_info->ref_cache_lock);
- root->fs_info->total_ref_cache_size -= size;
- spin_unlock(&root->fs_info->ref_cache_lock);
- }
-}
-
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
struct rb_node *node)
{
@@ -116,117 +66,3 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
}
return NULL;
}
-
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
- int shared)
-{
- struct btrfs_leaf_ref *ref = NULL;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-
- if (shared)
- tree = &root->fs_info->shared_ref_tree;
- if (!tree)
- return 0;
-
- spin_lock(&tree->lock);
- while (!list_empty(&tree->list)) {
- ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
- BUG_ON(ref->tree != tree);
- if (ref->root_gen > max_root_gen)
- break;
- if (!xchg(&ref->in_tree, 0)) {
- cond_resched_lock(&tree->lock);
- continue;
- }
-
- rb_erase(&ref->rb_node, &tree->root);
- list_del_init(&ref->list);
-
- spin_unlock(&tree->lock);
- btrfs_free_leaf_ref(root, ref);
- cond_resched();
- spin_lock(&tree->lock);
- }
- spin_unlock(&tree->lock);
- return 0;
-}
-
-/*
- * find the leaf ref for a given extent. This returns the ref struct with
- * a usage reference incremented
- */
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- u64 bytenr)
-{
- struct rb_node *rb;
- struct btrfs_leaf_ref *ref = NULL;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-again:
- if (tree) {
- spin_lock(&tree->lock);
- rb = tree_search(&tree->root, bytenr);
- if (rb)
- ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
- if (ref)
- atomic_inc(&ref->usage);
- spin_unlock(&tree->lock);
- if (ref)
- return ref;
- }
- if (tree != &root->fs_info->shared_ref_tree) {
- tree = &root->fs_info->shared_ref_tree;
- goto again;
- }
- return NULL;
-}
-
-/*
- * add a fully filled in leaf ref struct
- * remove all the refs older than a given root generation
- */
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
- int shared)
-{
- int ret = 0;
- struct rb_node *rb;
- struct btrfs_leaf_ref_tree *tree = root->ref_tree;
-
- if (shared)
- tree = &root->fs_info->shared_ref_tree;
-
- spin_lock(&tree->lock);
- rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
- if (rb) {
- ret = -EEXIST;
- } else {
- atomic_inc(&ref->usage);
- ref->tree = tree;
- ref->in_tree = 1;
- list_add_tail(&ref->list, &tree->list);
- }
- spin_unlock(&tree->lock);
- return ret;
-}
-
-/*
- * remove a single leaf ref from the tree. This drops the ref held by the tree
- * only
- */
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
-{
- struct btrfs_leaf_ref_tree *tree;
-
- if (!xchg(&ref->in_tree, 0))
- return 0;
-
- tree = ref->tree;
- spin_lock(&tree->lock);
-
- rb_erase(&ref->rb_node, &tree->root);
- list_del_init(&ref->list);
-
- spin_unlock(&tree->lock);
-
- btrfs_free_leaf_ref(root, ref);
- return 0;
-}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index e2a55cb2072b..24f7001f6387 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -49,28 +49,4 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
return sizeof(struct btrfs_leaf_ref) +
sizeof(struct btrfs_extent_info) * nr_extents;
}
-
-static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
-{
- tree->root = RB_ROOT;
- INIT_LIST_HEAD(&tree->list);
- spin_lock_init(&tree->lock);
-}
-
-static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
-{
- return RB_EMPTY_ROOT(&tree->root);
-}
-
-void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
-struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
- int nr_extents);
-void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
- u64 bytenr);
-int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
- int shared);
-int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
- int shared);
-int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 1f5556acb530..ca38eca70af0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -30,6 +30,7 @@
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
+#include "inode-map.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -507,6 +508,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
return 1;
}
+
static int should_ignore_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
@@ -529,7 +531,6 @@ static int should_ignore_root(struct btrfs_root *root)
*/
return 1;
}
-
/*
* find reloc tree by address of tree root
*/
@@ -709,7 +710,7 @@ again:
WARN_ON(cur->checked);
if (!list_empty(&cur->upper)) {
/*
- * the backref was added previously when processsing
+ * the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
BUG_ON(!list_is_singular(&cur->upper));
@@ -961,7 +962,7 @@ again:
lower = upper;
upper = NULL;
}
- btrfs_release_path(root, path2);
+ btrfs_release_path(path2);
next:
if (ptr < end) {
ptr += btrfs_extent_inline_ref_size(key.type);
@@ -974,7 +975,7 @@ next:
if (ptr >= end)
path1->slots[0]++;
}
- btrfs_release_path(rc->extent_root, path1);
+ btrfs_release_path(path1);
cur->checked = 1;
WARN_ON(exist);
@@ -1157,6 +1158,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
new_node->bytenr = dest->node->start;
new_node->level = node->level;
new_node->lowest = node->lowest;
+ new_node->checked = 1;
new_node->root = dest;
if (!node->lowest) {
@@ -1408,9 +1410,9 @@ again:
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
- if (objectid < entry->vfs_inode.i_ino)
+ if (objectid < btrfs_ino(&entry->vfs_inode))
node = node->rb_left;
- else if (objectid > entry->vfs_inode.i_ino)
+ else if (objectid > btrfs_ino(&entry->vfs_inode))
node = node->rb_right;
else
break;
@@ -1418,7 +1420,7 @@ again:
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
- if (objectid <= entry->vfs_inode.i_ino) {
+ if (objectid <= btrfs_ino(&entry->vfs_inode)) {
node = prev;
break;
}
@@ -1433,7 +1435,7 @@ again:
return inode;
}
- objectid = entry->vfs_inode.i_ino + 1;
+ objectid = btrfs_ino(&entry->vfs_inode) + 1;
if (cond_resched_lock(&root->inode_lock))
goto again;
@@ -1469,7 +1471,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
return -ENOMEM;
bytenr -= BTRFS_I(reloc_inode)->index_cnt;
- ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
bytenr, 0);
if (ret < 0)
goto out;
@@ -1557,11 +1559,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (first) {
inode = find_next_inode(root, key.objectid);
first = 0;
- } else if (inode && inode->i_ino < key.objectid) {
+ } else if (inode && btrfs_ino(inode) < key.objectid) {
btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
}
- if (inode && inode->i_ino == key.objectid) {
+ if (inode && btrfs_ino(inode) == key.objectid) {
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
@@ -1723,6 +1725,7 @@ again:
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
+ BUG_ON(!eb);
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1747,7 +1750,7 @@ again:
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
- btrfs_release_path(src, path);
+ btrfs_release_path(path);
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
@@ -1891,6 +1894,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
struct inode *inode = NULL;
u64 objectid;
u64 start, end;
+ u64 ino;
objectid = min_key->objectid;
while (1) {
@@ -1903,17 +1907,18 @@ static int invalidate_extent_cache(struct btrfs_root *root,
inode = find_next_inode(root, objectid);
if (!inode)
break;
+ ino = btrfs_ino(inode);
- if (inode->i_ino > max_key->objectid) {
+ if (ino > max_key->objectid) {
iput(inode);
break;
}
- objectid = inode->i_ino + 1;
+ objectid = ino + 1;
if (!S_ISREG(inode->i_mode))
continue;
- if (unlikely(min_key->objectid == inode->i_ino)) {
+ if (unlikely(min_key->objectid == ino)) {
if (min_key->type > BTRFS_EXTENT_DATA_KEY)
continue;
if (min_key->type < BTRFS_EXTENT_DATA_KEY)
@@ -1926,7 +1931,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
start = 0;
}
- if (unlikely(max_key->objectid == inode->i_ino)) {
+ if (unlikely(max_key->objectid == ino)) {
if (max_key->type < BTRFS_EXTENT_DATA_KEY)
continue;
if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
@@ -2344,7 +2349,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
root = next->root;
BUG_ON(!root);
- /* no other choice for non-refernce counted tree */
+ /* no other choice for non-reference counted tree */
if (!root->ref_cows)
return root;
@@ -2494,7 +2499,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
path->locks[upper->level] = 0;
slot = path->slots[upper->level];
- btrfs_release_path(NULL, path);
+ btrfs_release_path(path);
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
&slot);
@@ -2512,6 +2517,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
blocksize = btrfs_level_size(root, node->level);
generation = btrfs_node_ptr_generation(upper->eb, slot);
eb = read_tree_block(root, bytenr, blocksize, generation);
+ if (!eb) {
+ err = -EIO;
+ goto next;
+ }
btrfs_tree_lock(eb);
btrfs_set_lock_blocking(eb);
@@ -2669,6 +2678,7 @@ static int get_tree_block_key(struct reloc_control *rc,
BUG_ON(block->key_ready);
eb = read_tree_block(rc->extent_root, block->bytenr,
block->key.objectid, block->key.offset);
+ BUG_ON(!eb);
WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2730,7 +2740,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
} else {
path->lowest_level = node->level;
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret > 0)
ret = 0;
}
@@ -2863,7 +2873,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
struct extent_map *em;
int ret = 0;
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em)
return -ENOMEM;
@@ -3112,7 +3122,7 @@ static int add_tree_block(struct reloc_control *rc,
#endif
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
BUG_ON(level == -1);
@@ -3213,7 +3223,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
key.offset = 0;
inode = btrfs_iget(fs_info->sb, &key, root, NULL);
- if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
+ if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
if (inode && !IS_ERR(inode))
iput(inode);
return -ENOENT;
@@ -3498,7 +3508,7 @@ int add_data_references(struct reloc_control *rc,
}
path->slots[0]++;
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
if (err)
free_block_list(blocks);
return err;
@@ -3561,7 +3571,7 @@ next:
EXTENT_DIRTY);
if (ret == 0 && start <= key.objectid) {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
rc->search_start = end + 1;
} else {
rc->search_start = key.objectid + key.offset;
@@ -3569,7 +3579,7 @@ next:
return 0;
}
}
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -3653,6 +3663,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
u32 item_size;
int ret;
int err = 0;
+ int progress = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3665,9 +3676,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
while (1) {
+ progress++;
trans = btrfs_start_transaction(rc->extent_root, 0);
BUG_ON(IS_ERR(trans));
-
+restart:
if (update_backref_cache(trans, &rc->backref_cache)) {
btrfs_end_transaction(trans, rc->extent_root);
continue;
@@ -3704,7 +3716,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
flags = BTRFS_EXTENT_FLAG_DATA;
if (path_change) {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
path->search_commit_root = 1;
path->skip_locking = 1;
@@ -3727,7 +3739,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
(flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
ret = 0;
}
if (ret < 0) {
@@ -3780,8 +3792,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
}
}
+ if (trans && progress && err == -ENOSPC) {
+ ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+ rc->block_group->flags);
+ if (ret == 0) {
+ err = 0;
+ progress = 0;
+ goto restart;
+ }
+ }
- btrfs_release_path(rc->extent_root, path);
+ btrfs_release_path(path);
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
@@ -3849,7 +3870,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
out:
btrfs_free_path(path);
return ret;
@@ -3879,7 +3900,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans))
return ERR_CAST(trans);
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out;
@@ -3917,7 +3938,7 @@ static struct reloc_control *alloc_reloc_control(void)
INIT_LIST_HEAD(&rc->reloc_roots);
backref_cache_init(&rc->backref_cache);
mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ extent_io_tree_init(&rc->processed_blocks, NULL);
return rc;
}
@@ -4091,7 +4112,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
key.type != BTRFS_ROOT_ITEM_KEY)
@@ -4123,7 +4144,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.offset--;
}
- btrfs_release_path(root->fs_info->tree_root, path);
+ btrfs_release_path(path);
if (list_empty(&reloc_roots))
goto out;
@@ -4197,7 +4218,7 @@ out:
if (IS_ERR(fs_root))
err = PTR_ERR(fs_root);
else
- btrfs_orphan_cleanup(fs_root);
+ err = btrfs_orphan_cleanup(fs_root);
}
return err;
}
@@ -4224,7 +4245,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list);
+ disk_bytenr + len - 1, &list, 0);
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ffc..ebe45443de06 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,53 +22,6 @@
#include "print-tree.h"
/*
- * search forward for a root, starting with objectid 'search_start'
- * if a root key is found, the objectid we find is filled into 'found_objectid'
- * and 0 is returned. < 0 is returned on error, 1 if there is nothing
- * left in the tree.
- */
-int btrfs_search_root(struct btrfs_root *root, u64 search_start,
- u64 *found_objectid)
-{
- struct btrfs_path *path;
- struct btrfs_key search_key;
- int ret;
-
- root = root->fs_info->tree_root;
- search_key.objectid = search_start;
- search_key.type = (u8)-1;
- search_key.offset = (u64)-1;
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-again:
- ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto out;
- if (ret == 0) {
- ret = 1;
- goto out;
- }
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- goto out;
- }
- btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
- if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
- search_key.offset++;
- btrfs_release_path(root, path);
- goto again;
- }
- ret = 0;
- *found_objectid = search_key.objectid;
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* lookup the root with the highest offset for a given objectid. The key we do
* find is copied into 'key'. If we find something return 0, otherwise 1, < 0
* on error.
@@ -88,7 +41,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
search_key.offset = (u64)-1;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
goto out;
@@ -229,7 +183,7 @@ again:
memcpy(&found_key, &key, sizeof(key));
key.offset++;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
dead_root =
btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
&found_key);
@@ -291,7 +245,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
@@ -332,7 +286,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *leaf;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
ret = btrfs_search_slot(trans, root, key, path, -1, 1);
if (ret < 0)
goto out;
@@ -383,18 +338,22 @@ again:
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else
err = -ENOENT;
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
+out:
btrfs_free_path(path);
return err;
}
@@ -461,7 +420,7 @@ again:
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
- btrfs_release_path(tree_root, path);
+ btrfs_release_path(path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
@@ -471,3 +430,21 @@ again:
btrfs_free_path(path);
return 0;
}
+
+/*
+ * Old btrfs forgets to init root_item->flags and root_item->byte_limit
+ * for subvolumes. To work around this problem, we steal a bit from
+ * root_item->inode_item->flags, and use it to indicate if those fields
+ * have been properly initialized.
+ */
+void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
+{
+ u64 inode_flags = le64_to_cpu(root_item->inode.flags);
+
+ if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
+ inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
+ root_item->inode.flags = cpu_to_le64(inode_flags);
+ root_item->flags = 0;
+ root_item->byte_limit = 0;
+ }
+}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000000..6dfed0c27ac3
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,1369 @@
+/*
+ * Copyright (C) 2011 STRATO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "ordered-data.h"
+
+/*
+ * This is only the first step towards a full-featured scrub. It reads all
+ * extents and super blocks and verifies the checksums. In case a bad checksum
+ * is found or the extent cannot be read, good data will be written back if
+ * any can be found.
+ *
+ * Future enhancements:
+ * - To enhance the performance, better read-ahead strategies for the
+ * extent-tree can be employed.
+ * - In case an unrepairable extent is encountered, track which files are
+ * affected and report them
+ * - In case of a read error on files with nodatasum, map the file and read
+ * the extent to trigger a writeback of the good copy
+ * - track and record media errors, throw out bad devices
+ * - add a mode to also read unallocated space
+ * - make the prefetch cancellable
+ */
+
+struct scrub_bio;
+struct scrub_page;
+struct scrub_dev;
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_checksum(struct btrfs_work *work);
+static int scrub_checksum_data(struct scrub_dev *sdev,
+ struct scrub_page *spag, void *buffer);
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+ struct scrub_page *spag, u64 logical,
+ void *buffer);
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
+static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
+static void scrub_fixup_end_io(struct bio *bio, int err);
+static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
+ struct page *page);
+static void scrub_fixup(struct scrub_bio *sbio, int ix);
+
+#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
+#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
+
+struct scrub_page {
+ u64 flags; /* extent flags */
+ u64 generation;
+ u64 mirror_num;
+ int have_csum;
+ u8 csum[BTRFS_CSUM_SIZE];
+};
+
+struct scrub_bio {
+ int index;
+ struct scrub_dev *sdev;
+ struct bio *bio;
+ int err;
+ u64 logical;
+ u64 physical;
+ struct scrub_page spag[SCRUB_PAGES_PER_BIO];
+ u64 count;
+ int next_free;
+ struct btrfs_work work;
+};
+
+struct scrub_dev {
+ struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
+ struct btrfs_device *dev;
+ int first_free;
+ int curr;
+ atomic_t in_flight;
+ spinlock_t list_lock;
+ wait_queue_head_t list_wait;
+ u16 csum_size;
+ struct list_head csum_list;
+ atomic_t cancel_req;
+ int readonly;
+ /*
+ * statistics
+ */
+ struct btrfs_scrub_progress stat;
+ spinlock_t stat_lock;
+};
+
+static void scrub_free_csums(struct scrub_dev *sdev)
+{
+ while (!list_empty(&sdev->csum_list)) {
+ struct btrfs_ordered_sum *sum;
+ sum = list_first_entry(&sdev->csum_list,
+ struct btrfs_ordered_sum, list);
+ list_del(&sum->list);
+ kfree(sum);
+ }
+}
+
+static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
+{
+ int i;
+ int j;
+ struct page *last_page;
+
+ if (!sdev)
+ return;
+
+ for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+ struct scrub_bio *sbio = sdev->bios[i];
+ struct bio *bio;
+
+ if (!sbio)
+ break;
+
+ bio = sbio->bio;
+ if (bio) {
+ last_page = NULL;
+ for (j = 0; j < bio->bi_vcnt; ++j) {
+ if (bio->bi_io_vec[j].bv_page == last_page)
+ continue;
+ last_page = bio->bi_io_vec[j].bv_page;
+ __free_page(last_page);
+ }
+ bio_put(bio);
+ }
+ kfree(sbio);
+ }
+
+ scrub_free_csums(sdev);
+ kfree(sdev);
+}
+
+static noinline_for_stack
+struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+{
+ struct scrub_dev *sdev;
+ int i;
+ int j;
+ int ret;
+ struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+
+ sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
+ if (!sdev)
+ goto nomem;
+ sdev->dev = dev;
+ for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+ struct bio *bio;
+ struct scrub_bio *sbio;
+
+ sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
+ if (!sbio)
+ goto nomem;
+ sdev->bios[i] = sbio;
+
+ bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+ if (!bio)
+ goto nomem;
+
+ sbio->index = i;
+ sbio->sdev = sdev;
+ sbio->bio = bio;
+ sbio->count = 0;
+ sbio->work.func = scrub_checksum;
+ bio->bi_private = sdev->bios[i];
+ bio->bi_end_io = scrub_bio_end_io;
+ bio->bi_sector = 0;
+ bio->bi_bdev = dev->bdev;
+ bio->bi_size = 0;
+
+ for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
+ struct page *page;
+ page = alloc_page(GFP_NOFS);
+ if (!page)
+ goto nomem;
+
+ ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+ if (!ret)
+ goto nomem;
+ }
+ WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
+
+ if (i != SCRUB_BIOS_PER_DEV-1)
+ sdev->bios[i]->next_free = i + 1;
+ else
+ sdev->bios[i]->next_free = -1;
+ }
+ sdev->first_free = 0;
+ sdev->curr = -1;
+ atomic_set(&sdev->in_flight, 0);
+ atomic_set(&sdev->cancel_req, 0);
+ sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+ INIT_LIST_HEAD(&sdev->csum_list);
+
+ spin_lock_init(&sdev->list_lock);
+ spin_lock_init(&sdev->stat_lock);
+ init_waitqueue_head(&sdev->list_wait);
+ return sdev;
+
+nomem:
+ scrub_free_dev(sdev);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * scrub_recheck_error gets called when either verification of the page
+ * failed or the bio failed to read, e.g. with EIO. In the latter case,
+ * recheck_error gets called for every page in the bio, even though only
+ * one may be bad
+ */
+static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+{
+ if (sbio->err) {
+ if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
+ (sbio->physical + ix * PAGE_SIZE) >> 9,
+ sbio->bio->bi_io_vec[ix].bv_page) == 0) {
+ if (scrub_fixup_check(sbio, ix) == 0)
+ return;
+ }
+ }
+
+ scrub_fixup(sbio, ix);
+}
+
+static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
+{
+ int ret = 1;
+ struct page *page;
+ void *buffer;
+ u64 flags = sbio->spag[ix].flags;
+
+ page = sbio->bio->bi_io_vec[ix].bv_page;
+ buffer = kmap_atomic(page, KM_USER0);
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = scrub_checksum_data(sbio->sdev,
+ sbio->spag + ix, buffer);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = scrub_checksum_tree_block(sbio->sdev,
+ sbio->spag + ix,
+ sbio->logical + ix * PAGE_SIZE,
+ buffer);
+ } else {
+ WARN_ON(1);
+ }
+ kunmap_atomic(buffer, KM_USER0);
+
+ return ret;
+}
+
+static void scrub_fixup_end_io(struct bio *bio, int err)
+{
+ complete((struct completion *)bio->bi_private);
+}
+
+static void scrub_fixup(struct scrub_bio *sbio, int ix)
+{
+ struct scrub_dev *sdev = sbio->sdev;
+ struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+ struct btrfs_multi_bio *multi = NULL;
+ u64 logical = sbio->logical + ix * PAGE_SIZE;
+ u64 length;
+ int i;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
+ (sbio->spag[ix].have_csum == 0)) {
+ /*
+ * nodatasum, don't try to fix anything
+ * FIXME: we can do better, open the inode and trigger a
+ * writeback
+ */
+ goto uncorrectable;
+ }
+
+ length = PAGE_SIZE;
+ ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
+ &multi, 0);
+ if (ret || !multi || length < PAGE_SIZE) {
+ printk(KERN_ERR
+ "scrub_fixup: btrfs_map_block failed us for %llu\n",
+ (unsigned long long)logical);
+ WARN_ON(1);
+ return;
+ }
+
+ if (multi->num_stripes == 1)
+ /* there aren't any replicas */
+ goto uncorrectable;
+
+ /*
+ * first find a good copy
+ */
+ for (i = 0; i < multi->num_stripes; ++i) {
+ if (i == sbio->spag[ix].mirror_num)
+ continue;
+
+ if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
+ multi->stripes[i].physical >> 9,
+ sbio->bio->bi_io_vec[ix].bv_page)) {
+ /* I/O-error, this is not a good copy */
+ continue;
+ }
+
+ if (scrub_fixup_check(sbio, ix) == 0)
+ break;
+ }
+ if (i == multi->num_stripes)
+ goto uncorrectable;
+
+ if (!sdev->readonly) {
+ /*
+ * bi_io_vec[ix].bv_page now contains good data, write it back
+ */
+ if (scrub_fixup_io(WRITE, sdev->dev->bdev,
+ (sbio->physical + ix * PAGE_SIZE) >> 9,
+ sbio->bio->bi_io_vec[ix].bv_page)) {
+ /* I/O-error, writeback failed, give up */
+ goto uncorrectable;
+ }
+ }
+
+ kfree(multi);
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.corrected_errors;
+ spin_unlock(&sdev->stat_lock);
+
+ if (printk_ratelimit())
+ printk(KERN_ERR "btrfs: fixed up at %llu\n",
+ (unsigned long long)logical);
+ return;
+
+uncorrectable:
+ kfree(multi);
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.uncorrectable_errors;
+ spin_unlock(&sdev->stat_lock);
+
+ if (printk_ratelimit())
+ printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
+ (unsigned long long)logical);
+}
+
+static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
+ struct page *page)
+{
+ struct bio *bio = NULL;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(complete);
+
+ /* we are going to wait on this IO */
+ rw |= REQ_SYNC;
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ bio->bi_bdev = bdev;
+ bio->bi_sector = sector;
+ bio_add_page(bio, page, PAGE_SIZE, 0);
+ bio->bi_end_io = scrub_fixup_end_io;
+ bio->bi_private = &complete;
+ submit_bio(rw, bio);
+
+ wait_for_completion(&complete);
+
+ ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio_put(bio);
+ return ret;
+}
+
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+ struct scrub_bio *sbio = bio->bi_private;
+ struct scrub_dev *sdev = sbio->sdev;
+ struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+
+ sbio->err = err;
+
+ btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_checksum(struct btrfs_work *work)
+{
+ struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+ struct scrub_dev *sdev = sbio->sdev;
+ struct page *page;
+ void *buffer;
+ int i;
+ u64 flags;
+ u64 logical;
+ int ret;
+
+ if (sbio->err) {
+ for (i = 0; i < sbio->count; ++i)
+ scrub_recheck_error(sbio, i);
+
+ sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bio->bi_phys_segments = 0;
+ sbio->bio->bi_idx = 0;
+
+ for (i = 0; i < sbio->count; i++) {
+ struct bio_vec *bi;
+ bi = &sbio->bio->bi_io_vec[i];
+ bi->bv_offset = 0;
+ bi->bv_len = PAGE_SIZE;
+ }
+
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.read_errors;
+ spin_unlock(&sdev->stat_lock);
+ goto out;
+ }
+ for (i = 0; i < sbio->count; ++i) {
+ page = sbio->bio->bi_io_vec[i].bv_page;
+ buffer = kmap_atomic(page, KM_USER0);
+ flags = sbio->spag[i].flags;
+ logical = sbio->logical + i * PAGE_SIZE;
+ ret = 0;
+ if (flags & BTRFS_EXTENT_FLAG_DATA) {
+ ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
+ } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
+ logical, buffer);
+ } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
+ BUG_ON(i);
+ (void)scrub_checksum_super(sbio, buffer);
+ } else {
+ WARN_ON(1);
+ }
+ kunmap_atomic(buffer, KM_USER0);
+ if (ret)
+ scrub_recheck_error(sbio, i);
+ }
+
+out:
+ spin_lock(&sdev->list_lock);
+ sbio->next_free = sdev->first_free;
+ sdev->first_free = sbio->index;
+ spin_unlock(&sdev->list_lock);
+ atomic_dec(&sdev->in_flight);
+ wake_up(&sdev->list_wait);
+}
+
+static int scrub_checksum_data(struct scrub_dev *sdev,
+ struct scrub_page *spag, void *buffer)
+{
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ struct btrfs_root *root = sdev->dev->dev_root;
+
+ if (!spag->have_csum)
+ return 0;
+
+ crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, spag->csum, sdev->csum_size))
+ fail = 1;
+
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.data_extents_scrubbed;
+ sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
+ if (fail)
+ ++sdev->stat.csum_errors;
+ spin_unlock(&sdev->stat_lock);
+
+ return fail;
+}
+
+static int scrub_checksum_tree_block(struct scrub_dev *sdev,
+ struct scrub_page *spag, u64 logical,
+ void *buffer)
+{
+ struct btrfs_header *h;
+ struct btrfs_root *root = sdev->dev->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ int fail = 0;
+ int crc_fail = 0;
+
+ /*
+ * we don't use the getter functions here, as
+ * a) we don't have an extent buffer and
+ * b) the page is already kmapped
+ */
+ h = (struct btrfs_header *)buffer;
+
+ if (logical != le64_to_cpu(h->bytenr))
+ ++fail;
+
+ if (spag->generation != le64_to_cpu(h->generation))
+ ++fail;
+
+ if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ ++fail;
+
+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+ BTRFS_UUID_SIZE))
+ ++fail;
+
+ crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, h->csum, sdev->csum_size))
+ ++crc_fail;
+
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.tree_extents_scrubbed;
+ sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
+ if (crc_fail)
+ ++sdev->stat.csum_errors;
+ if (fail)
+ ++sdev->stat.verify_errors;
+ spin_unlock(&sdev->stat_lock);
+
+ return fail || crc_fail;
+}
+
+static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+{
+ struct btrfs_super_block *s;
+ u64 logical;
+ struct scrub_dev *sdev = sbio->sdev;
+ struct btrfs_root *root = sdev->dev->dev_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u8 csum[BTRFS_CSUM_SIZE];
+ u32 crc = ~(u32)0;
+ int fail = 0;
+
+ s = (struct btrfs_super_block *)buffer;
+ logical = sbio->logical;
+
+ if (logical != le64_to_cpu(s->bytenr))
+ ++fail;
+
+ if (sbio->spag[0].generation != le64_to_cpu(s->generation))
+ ++fail;
+
+ if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
+ ++fail;
+
+ crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
+ PAGE_SIZE - BTRFS_CSUM_SIZE);
+ btrfs_csum_final(crc, csum);
+ if (memcmp(csum, s->csum, sbio->sdev->csum_size))
+ ++fail;
+
+ if (fail) {
+ /*
+ * if we find an error in a super block, we just report it.
+ * Super blocks get rewritten with the next transaction commit
+ * anyway
+ */
+ spin_lock(&sdev->stat_lock);
+ ++sdev->stat.super_errors;
+ spin_unlock(&sdev->stat_lock);
+ }
+
+ return fail;
+}
+
+static int scrub_submit(struct scrub_dev *sdev)
+{
+ struct scrub_bio *sbio;
+
+ if (sdev->curr == -1)
+ return 0;
+
+ sbio = sdev->bios[sdev->curr];
+
+ sbio->bio->bi_sector = sbio->physical >> 9;
+ sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+ sbio->bio->bi_next = NULL;
+ sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
+ sbio->bio->bi_comp_cpu = -1;
+ sbio->bio->bi_bdev = sdev->dev->bdev;
+ sbio->err = 0;
+ sdev->curr = -1;
+ atomic_inc(&sdev->in_flight);
+
+ submit_bio(0, sbio->bio);
+
+ return 0;
+}
+
+static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
+ u64 physical, u64 flags, u64 gen, u64 mirror_num,
+ u8 *csum, int force)
+{
+ struct scrub_bio *sbio;
+
+again:
+ /*
+ * grab a fresh bio or wait for one to become available
+ */
+ while (sdev->curr == -1) {
+ spin_lock(&sdev->list_lock);
+ sdev->curr = sdev->first_free;
+ if (sdev->curr != -1) {
+ sdev->first_free = sdev->bios[sdev->curr]->next_free;
+ sdev->bios[sdev->curr]->next_free = -1;
+ sdev->bios[sdev->curr]->count = 0;
+ spin_unlock(&sdev->list_lock);
+ } else {
+ spin_unlock(&sdev->list_lock);
+ wait_event(sdev->list_wait, sdev->first_free != -1);
+ }
+ }
+ sbio = sdev->bios[sdev->curr];
+ if (sbio->count == 0) {
+ sbio->physical = physical;
+ sbio->logical = logical;
+ } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
+ sbio->logical + sbio->count * PAGE_SIZE != logical) {
+ scrub_submit(sdev);
+ goto again;
+ }
+ sbio->spag[sbio->count].flags = flags;
+ sbio->spag[sbio->count].generation = gen;
+ sbio->spag[sbio->count].have_csum = 0;
+ sbio->spag[sbio->count].mirror_num = mirror_num;
+ if (csum) {
+ sbio->spag[sbio->count].have_csum = 1;
+ memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+ }
+ ++sbio->count;
+ if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+ scrub_submit(sdev);
+
+ return 0;
+}
+
+static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+ u8 *csum)
+{
+ struct btrfs_ordered_sum *sum = NULL;
+ int ret = 0;
+ unsigned long i;
+ unsigned long num_sectors;
+ u32 sectorsize = sdev->dev->dev_root->sectorsize;
+
+ while (!list_empty(&sdev->csum_list)) {
+ sum = list_first_entry(&sdev->csum_list,
+ struct btrfs_ordered_sum, list);
+ if (sum->bytenr > logical)
+ return 0;
+ if (sum->bytenr + sum->len > logical)
+ break;
+
+ ++sdev->stat.csum_discards;
+ list_del(&sum->list);
+ kfree(sum);
+ sum = NULL;
+ }
+ if (!sum)
+ return 0;
+
+ num_sectors = sum->len / sectorsize;
+ for (i = 0; i < num_sectors; ++i) {
+ if (sum->sums[i].bytenr == logical) {
+ memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
+ ret = 1;
+ break;
+ }
+ }
+ if (ret && i == num_sectors - 1) {
+ list_del(&sum->list);
+ kfree(sum);
+ }
+ return ret;
+}
+
+/*
+ * Split the extent [logical, logical + len) into pieces of at most
+ * PAGE_SIZE and queue each piece with scrub_page().  For data extents the
+ * checksum is looked up first; pieces without one are counted in
+ * stat.no_csum and queued without a checksum.
+ *
+ * Returns 0 on success or the first non-zero value from scrub_page().
+ */
+static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
+			u64 physical, u64 flags, u64 gen, u64 mirror_num)
+{
+	u8 csum[BTRFS_CSUM_SIZE];
+	u64 chunk;
+	int ret;
+
+	for (; len; len -= chunk, logical += chunk, physical += chunk) {
+		u8 *page_csum = NULL;
+
+		chunk = min_t(u64, len, PAGE_SIZE);
+
+		if (flags & BTRFS_EXTENT_FLAG_DATA) {
+			/* push csums to sbio */
+			if (scrub_find_csum(sdev, logical, chunk, csum))
+				page_csum = csum;
+			else
+				++sdev->stat.no_csum;
+		}
+
+		ret = scrub_page(sdev, logical, chunk, physical, flags, gen,
+				 mirror_num, page_csum, 0);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Scrub the part of a chunk that is stored on stripe @num of @map.
+ *
+ * @base is the logical start of the chunk and @length is divided by
+ * map->stripe_len to get the number of stripes to visit.  The work is done
+ * in three passes over those stripes:
+ *   1. walk the extent tree once for every stripe (prefetch),
+ *   2. collect the data checksums of every stripe into sdev->csum_list,
+ *   3. walk the extent tree again and pass every extent, trimmed to the
+ *      stripe, to scrub_extent().
+ * Pass 3 checks for cancel and pause requests before each stripe; after a
+ * pause the collected checksums are dropped and passes 2 and 3 restart at
+ * the stripe that was about to be processed.
+ *
+ * Returns a negative errno on failure (-ECANCELED when canceled), 0
+ * otherwise; positive values from the helpers are squashed to 0.
+ */
+static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
+ struct map_lookup *map, int num, u64 base, u64 length)
+{
+ struct btrfs_path *path;
+ struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+ struct btrfs_root *root = fs_info->extent_root;
+ struct btrfs_root *csum_root = fs_info->csum_root;
+ struct btrfs_extent_item *extent;
+ u64 flags;
+ int ret;
+ int slot;
+ int i;
+ u64 nstripes;
+ int start_stripe;
+ struct extent_buffer *l;
+ struct btrfs_key key;
+ u64 physical;
+ u64 logical;
+ u64 generation;
+ u64 mirror_num;
+
+ u64 increment = map->stripe_len;
+ u64 offset;
+
+ nstripes = length;
+ offset = 0;
+ do_div(nstripes, map->stripe_len);
+ /*
+ * offset: logical distance from @base to the first stripe handled here
+ * increment: logical distance between two consecutive stripes handled here
+ */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ offset = map->stripe_len * num;
+ increment = map->stripe_len * map->num_stripes;
+ mirror_num = 0;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ int factor = map->num_stripes / map->sub_stripes;
+ offset = map->stripe_len * (num / map->sub_stripes);
+ increment = map->stripe_len * factor;
+ mirror_num = num % map->sub_stripes;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes;
+ } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
+ increment = map->stripe_len;
+ mirror_num = num % map->num_stripes;
+ } else {
+ increment = map->stripe_len;
+ mirror_num = 0;
+ }
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ path->reada = 2;
+ path->search_commit_root = 1;
+ path->skip_locking = 1;
+
+ /*
+ * find all extents for each stripe and just read them to get
+ * them into the page cache
+ * FIXME: we can do better. build a more intelligent prefetching
+ */
+ logical = base + offset;
+ physical = map->stripes[num].physical;
+ ret = 0;
+ for (i = 0; i < nstripes; ++i) {
+ key.objectid = logical;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, slot);
+ /* no item starts exactly at logical: step back to the previous extent item */
+ if (key.objectid != logical) {
+ ret = btrfs_previous_item(root, path, 0,
+ BTRFS_EXTENT_ITEM_KEY);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* walk forward until an item starts at or beyond the end of this stripe */
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ if (key.objectid >= logical + map->stripe_len)
+ break;
+
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+ logical += increment;
+ physical += map->stripe_len;
+ cond_resched();
+ }
+
+ /*
+ * collect all data csums for the stripe to avoid seeking during
+ * the scrub. This might currently (crc32) end up to be about 1MB
+ */
+ start_stripe = 0;
+again:
+ logical = base + offset + start_stripe * increment;
+ for (i = start_stripe; i < nstripes; ++i) {
+ ret = btrfs_lookup_csums_range(csum_root, logical,
+ logical + map->stripe_len - 1,
+ &sdev->csum_list, 1);
+ if (ret)
+ goto out;
+
+ logical += increment;
+ cond_resched();
+ }
+ /*
+ * now find all extents for each stripe and scrub them
+ */
+ logical = base + offset + start_stripe * increment;
+ physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+ ret = 0;
+ for (i = start_stripe; i < nstripes; ++i) {
+ /*
+ * canceled?
+ */
+ if (atomic_read(&fs_info->scrub_cancel_req) ||
+ atomic_read(&sdev->cancel_req)) {
+ ret = -ECANCELED;
+ goto out;
+ }
+ /*
+ * check to see if we have to pause
+ */
+ if (atomic_read(&fs_info->scrub_pause_req)) {
+ /* push queued extents */
+ scrub_submit(sdev);
+ wait_event(sdev->list_wait,
+ atomic_read(&sdev->in_flight) == 0);
+ /* report this scrub as paused and wake whoever waits on scrub_pause_wait */
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+ mutex_lock(&fs_info->scrub_lock);
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+ wake_up(&fs_info->scrub_pause_wait);
+ /* drop the collected csums and re-collect them starting at stripe i */
+ scrub_free_csums(sdev);
+ start_stripe = i;
+ goto again;
+ }
+
+ key.objectid = logical;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ l = path->nodes[0];
+ slot = path->slots[0];
+ btrfs_item_key_to_cpu(l, &key, slot);
+ /* no item starts exactly at logical: step back to the previous extent item */
+ if (key.objectid != logical) {
+ ret = btrfs_previous_item(root, path, 0,
+ BTRFS_EXTENT_ITEM_KEY);
+ if (ret < 0)
+ goto out;
+ }
+
+ while (1) {
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(l)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret == 0)
+ continue;
+ if (ret < 0)
+ goto out;
+
+ break;
+ }
+ btrfs_item_key_to_cpu(l, &key, slot);
+
+ /* item ends at or before the start of this stripe */
+ if (key.objectid + key.offset <= logical)
+ goto next;
+
+ /* item starts at or beyond the end of this stripe: stripe is done */
+ if (key.objectid >= logical + map->stripe_len)
+ break;
+
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+ goto next;
+
+ extent = btrfs_item_ptr(l, slot,
+ struct btrfs_extent_item);
+ flags = btrfs_extent_flags(l, extent);
+ generation = btrfs_extent_generation(l, extent);
+
+ if (key.objectid < logical &&
+ (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
+ printk(KERN_ERR
+ "btrfs scrub: tree block %llu spanning "
+ "stripes, ignored. logical=%llu\n",
+ (unsigned long long)key.objectid,
+ (unsigned long long)logical);
+ goto next;
+ }
+
+ /*
+ * trim extent to this stripe
+ */
+ if (key.objectid < logical) {
+ key.offset -= logical - key.objectid;
+ key.objectid = logical;
+ }
+ if (key.objectid + key.offset >
+ logical + map->stripe_len) {
+ key.offset = logical + map->stripe_len -
+ key.objectid;
+ }
+
+ /* physical address = offset of the extent inside the stripe + stripe start */
+ ret = scrub_extent(sdev, key.objectid, key.offset,
+ key.objectid - logical + physical,
+ flags, generation, mirror_num);
+ if (ret)
+ goto out;
+
+next:
+ path->slots[0]++;
+ }
+ btrfs_release_path(path);
+ logical += increment;
+ physical += map->stripe_len;
+ /* publish how far this device has been scrubbed */
+ spin_lock(&sdev->stat_lock);
+ sdev->stat.last_physical = physical;
+ spin_unlock(&sdev->stat_lock);
+ }
+ /* push queued extents */
+ scrub_submit(sdev);
+
+out:
+ btrfs_free_path(path);
+ return ret < 0 ? ret : 0;
+}
+
+/*
+ * Scrub every stripe of the chunk at @chunk_offset that is stored on
+ * sdev's device.
+ *
+ * Returns -EINVAL if no mapping is found at @chunk_offset, if the mapping
+ * does not start exactly there or is shorter than @length, or if none of
+ * its stripes belongs to this device.  Otherwise returns the result of the
+ * last scrub_stripe() call (the first failure ends the loop).
+ * @chunk_tree and @chunk_objectid are currently unused.
+ */
+static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
+	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+{
+	struct btrfs_mapping_tree *map_tree =
+		&sdev->dev->dev_root->fs_info->mapping_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	int ret = -EINVAL;
+	int stripe;
+
+	read_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
+	read_unlock(&map_tree->map_tree.lock);
+	if (!em)
+		return -EINVAL;
+
+	map = (struct map_lookup *)em->bdev;
+
+	if (em->start == chunk_offset && em->len >= length) {
+		for (stripe = 0; stripe < map->num_stripes; ++stripe) {
+			if (map->stripes[stripe].dev != sdev->dev)
+				continue;
+
+			ret = scrub_stripe(sdev, map, stripe, chunk_offset,
+					   length);
+			if (ret)
+				break;
+		}
+	}
+
+	free_extent_map(em);
+
+	return ret;
+}
+
+/*
+ * Walk the dev extent items of sdev's device and scrub every chunk whose
+ * dev extent overlaps the physical range [@start, @end).
+ *
+ * Dev extents that end at or before @start are skipped; the walk stops at
+ * the first dev extent starting at or beyond @end, at the first item that
+ * is not a dev extent of this device, or when the tree is exhausted.
+ *
+ * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
+ * the block group of a chunk cannot be found, or the error reported by the
+ * tree search / scrub_chunk().
+ */
+static noinline_for_stack
+int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+{
+	struct btrfs_dev_extent *dev_extent = NULL;
+	struct btrfs_path *path;
+	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 length;
+	u64 chunk_tree;
+	u64 chunk_objectid;
+	u64 chunk_offset;
+	int ret;
+	int slot;
+	struct extent_buffer *l;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_block_group_cache *cache;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+
+	key.objectid = sdev->dev->devid;
+	key.offset = 0ull;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0)
+			goto out;
+
+		/*
+		 * An inexact match may leave the slot one past the last item
+		 * of the leaf; move to the next leaf before reading the key
+		 * so we never read beyond the leaf's items.
+		 */
+		if (ret > 0 &&
+		    path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				/* no more leaves: nothing left to scrub */
+				ret = 0;
+				break;
+			}
+		}
+		ret = 0;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(l, &found_key, slot);
+
+		if (found_key.objectid != sdev->dev->devid)
+			break;
+
+		/* check the item we found, not the key we searched for */
+		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
+			break;
+
+		if (found_key.offset >= end)
+			break;
+
+		if (found_key.offset < key.offset)
+			break;
+
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		length = btrfs_dev_extent_length(l, dev_extent);
+
+		/* dev extent lies completely before the requested range */
+		if (found_key.offset + length <= start) {
+			key.offset = found_key.offset + length;
+			btrfs_release_path(path);
+			continue;
+		}
+
+		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
+		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
+		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
+
+		/*
+		 * get a reference on the corresponding block group to prevent
+		 * the chunk from going away while we scrub it
+		 */
+		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+		if (!cache) {
+			ret = -ENOENT;
+			goto out;
+		}
+		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
+				  chunk_offset, length);
+		btrfs_put_block_group(cache);
+		if (ret)
+			break;
+
+		/* continue the search right behind this dev extent */
+		key.offset = found_key.offset + length;
+		btrfs_release_path(path);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Queue one page for every super block mirror of sdev's device and wait
+ * until all of them have completed.  The loop stops at the first mirror for
+ * which bytenr + BTRFS_SUPER_INFO_SIZE is not below device->total_bytes.
+ *
+ * Returns 0 on success or the first non-zero value from scrub_page().
+ */
+static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+{
+	struct btrfs_device *device = sdev->dev;
+	u64 gen = device->dev_root->fs_info->last_trans_committed;
+	int mirror;
+
+	for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
+		u64 bytenr = btrfs_sb_offset(mirror);
+		int ret;
+
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+			break;
+
+		/* force == 1: each super block page is submitted right away */
+		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
+				 BTRFS_EXTENT_FLAG_SUPER, gen, mirror, NULL, 1);
+		if (ret)
+			return ret;
+	}
+
+	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+	return 0;
+}
+
+/*
+ * get a reference count on fs_info->scrub_workers. start worker if necessary
+ *
+ * The count is protected by fs_info->scrub_lock; the workers are started
+ * when the count goes from 0 to 1.  Always returns 0.
+ *
+ * NOTE(review): the result of btrfs_start_workers() is not checked here -
+ * confirm whether it can fail and should be propagated.
+ */
+static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (fs_info->scrub_workers_refcnt == 0)
+ btrfs_start_workers(&fs_info->scrub_workers, 1);
+ ++fs_info->scrub_workers_refcnt;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+/*
+ * Drop a reference on fs_info->scrub_workers under fs_info->scrub_lock and
+ * stop the workers when the count reaches 0.  Warns if the count has become
+ * negative, i.e. on a put without a matching get.
+ */
+static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (--fs_info->scrub_workers_refcnt == 0)
+ btrfs_stop_workers(&fs_info->scrub_workers);
+ WARN_ON(fs_info->scrub_workers_refcnt < 0);
+ mutex_unlock(&fs_info->scrub_lock);
+}
+
+
+/*
+ * Scrub the physical range [@start, @end) of device @devid and block until
+ * all submitted I/O has finished.
+ *
+ * @progress: if not NULL, receives a copy of the scrub statistics
+ * @readonly: stored in the scrub context (sdev->readonly)
+ *
+ * Returns 0 on success or
+ *   -EINVAL      if the fs is closing or sectorsize, leafsize and nodesize
+ *                are not all equal to PAGE_SIZE,
+ *   -ENODEV      if the device is not found, is missing or is not part of
+ *                the fs metadata,
+ *   -EINPROGRESS if a scrub is already attached to the device,
+ * or the error from scrub_workers_get(), scrub_setup_dev(), scrub_supers()
+ * or scrub_enumerate_chunks().
+ */
+int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
+		    struct btrfs_scrub_progress *progress, int readonly)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device *dev;
+	struct scrub_dev *sdev;
+	int ret;
+
+	if (fs_info->closing)
+		return -EINVAL;
+
+	/*
+	 * check some assumptions
+	 */
+	if (root->sectorsize != PAGE_SIZE ||
+	    root->sectorsize != root->leafsize ||
+	    root->sectorsize != root->nodesize) {
+		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+		return -EINVAL;
+	}
+
+	ret = scrub_workers_get(root);
+	if (ret)
+		return ret;
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (!dev || dev->missing) {
+		ret = -ENODEV;
+		goto out_unlock_devices;
+	}
+
+	mutex_lock(&fs_info->scrub_lock);
+	if (!dev->in_fs_metadata) {
+		ret = -ENODEV;
+		goto out_unlock_scrub;
+	}
+	if (dev->scrub_device) {
+		ret = -EINPROGRESS;
+		goto out_unlock_scrub;
+	}
+	sdev = scrub_setup_dev(dev);
+	if (IS_ERR(sdev)) {
+		ret = PTR_ERR(sdev);
+		goto out_unlock_scrub;
+	}
+
+	/* attach the scrub context to the device and account the scrub */
+	sdev->readonly = readonly;
+	dev->scrub_device = sdev;
+	atomic_inc(&fs_info->scrubs_running);
+
+	mutex_unlock(&fs_info->scrub_lock);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	/* super blocks are scrubbed with scrub_super_lock held for reading */
+	down_read(&fs_info->scrub_super_lock);
+	ret = scrub_supers(sdev);
+	up_read(&fs_info->scrub_super_lock);
+
+	if (ret == 0)
+		ret = scrub_enumerate_chunks(sdev, start, end);
+
+	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+
+	atomic_dec(&fs_info->scrubs_running);
+	wake_up(&fs_info->scrub_pause_wait);
+
+	if (progress)
+		memcpy(progress, &sdev->stat, sizeof(*progress));
+
+	/* detach the scrub context before it is freed */
+	mutex_lock(&fs_info->scrub_lock);
+	dev->scrub_device = NULL;
+	mutex_unlock(&fs_info->scrub_lock);
+
+	scrub_free_dev(sdev);
+	scrub_workers_put(root);
+
+	return ret;
+
+out_unlock_scrub:
+	mutex_unlock(&fs_info->scrub_lock);
+out_unlock_devices:
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	scrub_workers_put(root);
+	return ret;
+}
+
+/*
+ * Raise fs_info->scrub_pause_req and block until the number of paused
+ * scrubs equals the number of running scrubs.  The request stays raised
+ * when this returns; btrfs_scrub_continue() is what decrements it again.
+ *
+ * Always returns 0.
+ */
+int btrfs_scrub_pause(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ atomic_inc(&fs_info->scrub_pause_req);
+ /* scrub_lock is dropped while sleeping and re-taken to re-check */
+ while (atomic_read(&fs_info->scrubs_paused) !=
+ atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_paused) ==
+ atomic_read(&fs_info->scrubs_running));
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+/*
+ * Drop one pause request and wake everybody sleeping on scrub_pause_wait.
+ * Always returns 0.
+ */
+int btrfs_scrub_continue(struct btrfs_root *root)
+{
+	atomic_dec(&root->fs_info->scrub_pause_req);
+	wake_up(&root->fs_info->scrub_pause_wait);
+
+	return 0;
+}
+
+/*
+ * Take scrub_super_lock for writing; btrfs_scrub_dev() holds it for reading
+ * while it scrubs the super blocks.  Always returns 0.
+ */
+int btrfs_scrub_pause_super(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	down_write(&fs_info->scrub_super_lock);
+
+	return 0;
+}
+
+/*
+ * Release the write side of scrub_super_lock taken by
+ * btrfs_scrub_pause_super().  Always returns 0.
+ */
+int btrfs_scrub_continue_super(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	up_write(&fs_info->scrub_super_lock);
+
+	return 0;
+}
+
+/*
+ * Cancel all running scrubs of this filesystem: raise
+ * fs_info->scrub_cancel_req, wait until scrubs_running has dropped to 0,
+ * then lower the request again.
+ *
+ * Returns 0 on success, -ENOTCONN if no scrub is running.
+ */
+int btrfs_scrub_cancel(struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (!atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+
+ atomic_inc(&fs_info->scrub_cancel_req);
+ /* scrub_lock is dropped while sleeping and re-taken to re-check */
+ while (atomic_read(&fs_info->scrubs_running)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrubs_running) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ atomic_dec(&fs_info->scrub_cancel_req);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+
+/*
+ * Cancel the scrub attached to @dev: raise the cancel request of its scrub
+ * context and wait until dev->scrub_device has been cleared.
+ *
+ * Returns 0 on success, -ENOTCONN if no scrub is attached to @dev.
+ */
+int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct scrub_dev *sdev;
+
+ mutex_lock(&fs_info->scrub_lock);
+ sdev = dev->scrub_device;
+ if (!sdev) {
+ mutex_unlock(&fs_info->scrub_lock);
+ return -ENOTCONN;
+ }
+ atomic_inc(&sdev->cancel_req);
+ /* scrub_lock is dropped while sleeping and re-taken to re-check */
+ while (dev->scrub_device) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ dev->scrub_device == NULL);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+ mutex_unlock(&fs_info->scrub_lock);
+
+ return 0;
+}
+/*
+ * Cancel the scrub running on the device with id @devid.
+ *
+ * Returns -ENODEV if there is no such device, otherwise the result of
+ * btrfs_scrub_cancel_dev().
+ */
+int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_device *dev;
+	int ret = -ENODEV;
+
+	/*
+	 * we have to hold the device_list_mutex here so the device
+	 * does not go away in cancel_dev. FIXME: find a better solution
+	 */
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (dev)
+		ret = btrfs_scrub_cancel_dev(root, dev);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	return ret;
+}
+
+/*
+ * Copy the statistics of the scrub attached to device @devid to @progress.
+ *
+ * Returns 0 on success, -ENODEV if the device does not exist and -ENOTCONN
+ * if no scrub is attached to it; @progress is only written on success.
+ */
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress)
+{
+	struct btrfs_device *dev;
+	struct scrub_dev *sdev;
+	int ret = -ENODEV;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root, devid, NULL, NULL);
+	if (dev) {
+		sdev = dev->scrub_device;
+		if (sdev) {
+			memcpy(progress, &sdev->stat, sizeof(*progress));
+			ret = 0;
+		} else {
+			ret = -ENOTCONN;
+		}
+	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	return ret;
+}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a004008f7d28..9b2e7e5bc3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,7 +39,9 @@
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/cleancache.h>
#include "compat.h"
+#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -52,6 +54,9 @@
#include "export.h"
#include "compression.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/btrfs.h>
+
static const struct super_operations btrfs_super_ops;
static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
@@ -155,7 +160,8 @@ enum {
Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
- Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
+ Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+ Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
};
static match_table_t tokens = {
@@ -184,6 +190,9 @@ static match_table_t tokens = {
{Opt_space_cache, "space_cache"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+ {Opt_enospc_debug, "enospc_debug"},
+ {Opt_subvolrootid, "subvolrootid=%d"},
+ {Opt_defrag, "autodefrag"},
{Opt_err, NULL},
};
@@ -227,6 +236,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
break;
case Opt_subvol:
case Opt_subvolid:
+ case Opt_subvolrootid:
case Opt_device:
/*
* These are parsed by btrfs_parse_early_options
@@ -358,6 +368,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_user_subvol_rm_allowed:
btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
break;
+ case Opt_enospc_debug:
+ btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ printk(KERN_INFO "btrfs: enabling auto defrag");
+ btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
+ break;
case Opt_err:
printk(KERN_INFO "btrfs: unrecognized mount option "
"'%s'\n", p);
@@ -380,7 +397,7 @@ out:
*/
static int btrfs_parse_early_options(const char *options, fmode_t flags,
void *holder, char **subvol_name, u64 *subvol_objectid,
- struct btrfs_fs_devices **fs_devices)
+ u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
{
substring_t args[MAX_OPT_ARGS];
char *opts, *orig, *p;
@@ -421,6 +438,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
*subvol_objectid = intarg;
}
break;
+ case Opt_subvolrootid:
+ intarg = 0;
+ error = match_int(&args[0], &intarg);
+ if (!error) {
+ /* we want the original fs_tree */
+ if (!intarg)
+ *subvol_rootid =
+ BTRFS_FS_TREE_OBJECTID;
+ else
+ *subvol_rootid = intarg;
+ }
+ break;
case Opt_device:
error = btrfs_scan_one_device(match_strdup(&args[0]),
flags, holder, fs_devices);
@@ -484,8 +513,10 @@ static struct dentry *get_default_root(struct super_block *sb,
*/
dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
- if (IS_ERR(di))
+ if (IS_ERR(di)) {
+ btrfs_free_path(path);
return ERR_CAST(di);
+ }
if (!di) {
/*
* Ok the default dir item isn't there. This is weird since
@@ -602,6 +633,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_root = root_dentry;
save_mount_options(sb, data);
+ cleancache_init_fs(sb);
return 0;
fail_close:
@@ -615,6 +647,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
struct btrfs_root *root = btrfs_sb(sb);
int ret;
+ trace_btrfs_sync_fs(wait);
+
if (!wait) {
filemap_flush(root->fs_info->btree_inode->i_mapping);
return 0;
@@ -634,6 +668,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
struct btrfs_fs_info *info = root->fs_info;
+ char *compress_type;
if (btrfs_test_opt(root, DEGRADED))
seq_puts(seq, ",degraded");
@@ -652,8 +687,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
- if (btrfs_test_opt(root, COMPRESS))
- seq_puts(seq, ",compress");
+ if (btrfs_test_opt(root, COMPRESS)) {
+ if (info->compress_type == BTRFS_COMPRESS_ZLIB)
+ compress_type = "zlib";
+ else
+ compress_type = "lzo";
+ if (btrfs_test_opt(root, FORCE_COMPRESS))
+ seq_printf(seq, ",compress-force=%s", compress_type);
+ else
+ seq_printf(seq, ",compress=%s", compress_type);
+ }
if (btrfs_test_opt(root, NOSSD))
seq_puts(seq, ",nossd");
if (btrfs_test_opt(root, SSD_SPREAD))
@@ -668,6 +711,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",discard");
if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
+ if (btrfs_test_opt(root, SPACE_CACHE))
+ seq_puts(seq, ",space_cache");
+ if (btrfs_test_opt(root, CLEAR_CACHE))
+ seq_puts(seq, ",clear_cache");
+ if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ seq_puts(seq, ",user_subvol_rm_allowed");
return 0;
}
@@ -700,7 +749,7 @@ static int btrfs_set_super(struct super_block *s, void *data)
* for multiple device setup. Make sure to keep it in sync.
*/
static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+ const char *device_name, void *data)
{
struct block_device *bdev = NULL;
struct super_block *s;
@@ -711,6 +760,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
fmode_t mode = FMODE_READ;
char *subvol_name = NULL;
u64 subvol_objectid = 0;
+ u64 subvol_rootid = 0;
int error = 0;
if (!(flags & MS_RDONLY))
@@ -718,11 +768,11 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
error = btrfs_parse_early_options(data, mode, fs_type,
&subvol_name, &subvol_objectid,
- &fs_devices);
+ &subvol_rootid, &fs_devices);
if (error)
return ERR_PTR(error);
- error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
+ error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
if (error)
goto error_free_subvol_name;
@@ -782,15 +832,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
s->s_flags |= MS_ACTIVE;
}
- root = get_default_root(s, subvol_objectid);
- if (IS_ERR(root)) {
- error = PTR_ERR(root);
- deactivate_locked_super(s);
- goto error_free_subvol_name;
- }
/* if they gave us a subvolume name bind mount into that */
if (strcmp(subvol_name, ".")) {
struct dentry *new_root;
+
+ root = get_default_root(s, subvol_rootid);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ deactivate_locked_super(s);
+ goto error_free_subvol_name;
+ }
+
mutex_lock(&root->d_inode->i_mutex);
new_root = lookup_one_len(subvol_name, root,
strlen(subvol_name));
@@ -811,6 +863,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
}
dput(root);
root = new_root;
+ } else {
+ root = get_default_root(s, subvol_objectid);
+ if (IS_ERR(root)) {
+ error = PTR_ERR(root);
+ deactivate_locked_super(s);
+ goto error_free_subvol_name;
+ }
}
kfree(subvol_name);
@@ -864,6 +923,32 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+/* sort() comparator: orders device infos by max_avail, largest first */
+static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+				       const void *dev_info2)
+{
+	const struct btrfs_device_info *a = dev_info1;
+	const struct btrfs_device_info *b = dev_info2;
+
+	if (a->max_avail > b->max_avail)
+		return -1;
+	if (a->max_avail < b->max_avail)
+		return 1;
+	return 0;
+}
+
+/*
+ * Sort @nr_devices entries of @devices in place so that the entry with the
+ * largest max_avail (max free extent size of the device) comes first.
+ */
+static inline void btrfs_descending_sort_devices(
+ struct btrfs_device_info *devices,
+ size_t nr_devices)
+{
+ sort(devices, nr_devices, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_free_bytes, NULL);
+}
+
/*
* The helper to calc the free space on the devices that can be used to store
* file data.
@@ -1157,10 +1242,14 @@ static int __init init_btrfs_fs(void)
if (err)
goto free_extent_io;
- err = btrfs_interface_init();
+ err = btrfs_delayed_inode_init();
if (err)
goto free_extent_map;
+ err = btrfs_interface_init();
+ if (err)
+ goto free_delayed_inode;
+
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
@@ -1170,6 +1259,8 @@ static int __init init_btrfs_fs(void)
unregister_ioctl:
btrfs_interface_exit();
+free_delayed_inode:
+ btrfs_delayed_inode_exit();
free_extent_map:
extent_map_exit();
free_extent_io:
@@ -1186,6 +1277,7 @@ free_sysfs:
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
+ btrfs_delayed_inode_exit();
extent_map_exit();
extent_io_exit();
btrfs_interface_exit();
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4ce16ef702a3..c3c223ae6691 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -174,86 +174,9 @@ static const struct sysfs_ops btrfs_root_attr_ops = {
.store = btrfs_root_attr_store,
};
-static struct kobj_type btrfs_root_ktype = {
- .default_attrs = btrfs_root_attrs,
- .sysfs_ops = &btrfs_root_attr_ops,
- .release = btrfs_root_release,
-};
-
-static struct kobj_type btrfs_super_ktype = {
- .default_attrs = btrfs_super_attrs,
- .sysfs_ops = &btrfs_super_attr_ops,
- .release = btrfs_super_release,
-};
-
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
-int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
-{
- int error;
- char *name;
- char c;
- int len = strlen(fs->sb->s_id) + 1;
- int i;
-
- name = kmalloc(len, GFP_NOFS);
- if (!name) {
- error = -ENOMEM;
- goto fail;
- }
-
- for (i = 0; i < len; i++) {
- c = fs->sb->s_id[i];
- if (c == '/' || c == '\\')
- c = '!';
- name[i] = c;
- }
- name[len] = '\0';
-
- fs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
- NULL, "%s", name);
- kfree(name);
- if (error)
- goto fail;
-
- return 0;
-
-fail:
- printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
- return error;
-}
-
-int btrfs_sysfs_add_root(struct btrfs_root *root)
-{
- int error;
-
- error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
- &root->fs_info->super_kobj,
- "%s", root->name);
- if (error)
- goto fail;
-
- return 0;
-
-fail:
- printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
- return error;
-}
-
-void btrfs_sysfs_del_root(struct btrfs_root *root)
-{
- kobject_put(&root->root_kobj);
- wait_for_completion(&root->kobj_unregister);
-}
-
-void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
-{
- kobject_put(&fs->super_kobj);
- wait_for_completion(&fs->kobj_unregister);
-}
-
int btrfs_init_sysfs(void)
{
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3d73c8d93bbb..dc80f7156923 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,15 +27,14 @@
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
+#include "inode-map.h"
#define BTRFS_ROOT_TRANS_TAG 0
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
- WARN_ON(transaction->use_count == 0);
- transaction->use_count--;
- if (transaction->use_count == 0) {
- list_del_init(&transaction->list);
+ WARN_ON(atomic_read(&transaction->use_count) == 0);
+ if (atomic_dec_and_test(&transaction->use_count)) {
memset(transaction, 0, sizeof(*transaction));
kmem_cache_free(btrfs_transaction_cachep, transaction);
}
@@ -57,16 +56,17 @@ static noinline int join_transaction(struct btrfs_root *root)
if (!cur_trans) {
cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
GFP_NOFS);
- BUG_ON(!cur_trans);
+ if (!cur_trans)
+ return -ENOMEM;
root->fs_info->generation++;
- cur_trans->num_writers = 1;
+ atomic_set(&cur_trans->num_writers, 1);
cur_trans->num_joined = 0;
cur_trans->transid = root->fs_info->generation;
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
cur_trans->in_commit = 0;
cur_trans->blocked = 0;
- cur_trans->use_count = 1;
+ atomic_set(&cur_trans->use_count, 1);
cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
@@ -81,13 +81,12 @@ static noinline int join_transaction(struct btrfs_root *root)
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
- root->fs_info->btree_inode->i_mapping,
- GFP_NOFS);
+ root->fs_info->btree_inode->i_mapping);
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = cur_trans;
spin_unlock(&root->fs_info->new_trans_lock);
} else {
- cur_trans->num_writers++;
+ atomic_inc(&cur_trans->num_writers);
cur_trans->num_joined++;
}
@@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
cur_trans = root->fs_info->running_transaction;
if (cur_trans && cur_trans->blocked) {
DEFINE_WAIT(wait);
- cur_trans->use_count++;
+ atomic_inc(&cur_trans->use_count);
while (1) {
prepare_to_wait(&root->fs_info->transaction_wait, &wait,
TASK_UNINTERRUPTIBLE);
@@ -180,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
+ int retries = 0;
int ret;
if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -195,10 +195,15 @@ again:
wait_current_trans(root);
ret = join_transaction(root);
- BUG_ON(ret);
+ if (ret < 0) {
+ kmem_cache_free(btrfs_trans_handle_cachep, h);
+ if (type != TRANS_JOIN_NOLOCK)
+ mutex_unlock(&root->fs_info->trans_mutex);
+ return ERR_PTR(ret);
+ }
cur_trans = root->fs_info->running_transaction;
- cur_trans->use_count++;
+ atomic_inc(&cur_trans->use_count);
if (type != TRANS_JOIN_NOLOCK)
mutex_unlock(&root->fs_info->trans_mutex);
@@ -218,10 +223,18 @@ again:
if (num_items > 0) {
ret = btrfs_trans_reserve_metadata(h, root, num_items);
- if (ret == -EAGAIN) {
+ if (ret == -EAGAIN && !retries) {
+ retries++;
btrfs_commit_transaction(h, root);
goto again;
+ } else if (ret == -EAGAIN) {
+ /*
+ * We have already retried and got EAGAIN, so really we
+ * don't have space, so set ret to -ENOSPC.
+ */
+ ret = -ENOSPC;
}
+
if (ret < 0) {
btrfs_end_transaction(h, root);
return ERR_PTR(ret);
@@ -321,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
goto out_unlock; /* nothing committing|committed */
}
- cur_trans->use_count++;
+ atomic_inc(&cur_trans->use_count);
mutex_unlock(&root->fs_info->trans_mutex);
wait_for_commit(root, cur_trans);
@@ -334,49 +347,6 @@ out_unlock:
return ret;
}
-#if 0
-/*
- * rate limit against the drop_snapshot code. This helps to slow down new
- * operations if the drop_snapshot code isn't able to keep up.
- */
-static void throttle_on_drops(struct btrfs_root *root)
-{
- struct btrfs_fs_info *info = root->fs_info;
- int harder_count = 0;
-
-harder:
- if (atomic_read(&info->throttles)) {
- DEFINE_WAIT(wait);
- int thr;
- thr = atomic_read(&info->throttle_gen);
-
- do {
- prepare_to_wait(&info->transaction_throttle,
- &wait, TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&info->throttles)) {
- finish_wait(&info->transaction_throttle, &wait);
- break;
- }
- schedule();
- finish_wait(&info->transaction_throttle, &wait);
- } while (thr == atomic_read(&info->throttle_gen));
- harder_count++;
-
- if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
- harder_count < 2)
- goto harder;
-
- if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
- harder_count < 10)
- goto harder;
-
- if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
- harder_count < 20)
- goto harder;
- }
-}
-#endif
-
void btrfs_throttle(struct btrfs_root *root)
{
mutex_lock(&root->fs_info->trans_mutex);
@@ -451,18 +421,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up_process(info->transaction_kthread);
}
- if (lock)
- mutex_lock(&info->trans_mutex);
WARN_ON(cur_trans != info->running_transaction);
- WARN_ON(cur_trans->num_writers < 1);
- cur_trans->num_writers--;
+ WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
+ atomic_dec(&cur_trans->num_writers);
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
- if (lock)
- mutex_unlock(&info->trans_mutex);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -478,19 +444,40 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 0, 1);
+ int ret;
+
+ ret = __btrfs_end_transaction(trans, root, 0, 1);
+ if (ret)
+ return ret;
+ return 0;
}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 1, 1);
+ int ret;
+
+ ret = __btrfs_end_transaction(trans, root, 1, 1);
+ if (ret)
+ return ret;
+ return 0;
}
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- return __btrfs_end_transaction(trans, root, 0, 0);
+ int ret;
+
+ ret = __btrfs_end_transaction(trans, root, 0, 0);
+ if (ret)
+ return ret;
+ return 0;
+}
+
+int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ return __btrfs_end_transaction(trans, root, 1, 1);
}
/*
@@ -751,8 +738,14 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_update_reloc_root(trans, root);
btrfs_orphan_commit_root(trans, root);
+ btrfs_save_ino_cache(root, trans);
+
if (root->commit_root != root->node) {
+ mutex_lock(&root->fs_commit_mutex);
switch_commit_root(root);
+ btrfs_unpin_free_ino(root);
+ mutex_unlock(&root->fs_commit_mutex);
+
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -800,97 +793,6 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
return ret;
}
-#if 0
-/*
- * when dropping snapshots, we generate a ton of delayed refs, and it makes
- * sense not to join the transaction while it is trying to flush the current
- * queue of delayed refs out.
- *
- * This is used by the drop snapshot code only
- */
-static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
-{
- DEFINE_WAIT(wait);
-
- mutex_lock(&info->trans_mutex);
- while (info->running_transaction &&
- info->running_transaction->delayed_refs.flushing) {
- prepare_to_wait(&info->transaction_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- mutex_unlock(&info->trans_mutex);
-
- schedule();
-
- mutex_lock(&info->trans_mutex);
- finish_wait(&info->transaction_wait, &wait);
- }
- mutex_unlock(&info->trans_mutex);
- return 0;
-}
-
-/*
- * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
- * all of them
- */
-int btrfs_drop_dead_root(struct btrfs_root *root)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *tree_root = root->fs_info->tree_root;
- unsigned long nr;
- int ret;
-
- while (1) {
- /*
- * we don't want to jump in and create a bunch of
- * delayed refs if the transaction is starting to close
- */
- wait_transaction_pre_flush(tree_root->fs_info);
- trans = btrfs_start_transaction(tree_root, 1);
-
- /*
- * we've joined a transaction, make sure it isn't
- * closing right now
- */
- if (trans->transaction->delayed_refs.flushing) {
- btrfs_end_transaction(trans, tree_root);
- continue;
- }
-
- ret = btrfs_drop_snapshot(trans, root);
- if (ret != -EAGAIN)
- break;
-
- ret = btrfs_update_root(trans, tree_root,
- &root->root_key,
- &root->root_item);
- if (ret)
- break;
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, tree_root);
- BUG_ON(ret);
-
- btrfs_btree_balance_dirty(tree_root, nr);
- cond_resched();
- }
- BUG_ON(ret);
-
- ret = btrfs_del_root(trans, tree_root, &root->root_key);
- BUG_ON(ret);
-
- nr = trans->blocks_used;
- ret = btrfs_end_transaction(trans, tree_root);
- BUG_ON(ret);
-
- free_extent_buffer(root->node);
- free_extent_buffer(root->commit_root);
- kfree(root);
-
- btrfs_btree_balance_dirty(tree_root, nr);
- return ret;
-}
-#endif
-
/*
* new snapshots need to be created at a very specific time in the
* transaction commit. This does the actual creation
@@ -921,7 +823,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
+ ret = btrfs_find_free_objectid(tree_root, &objectid);
if (ret) {
pending->error = ret;
goto fail;
@@ -958,7 +860,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BUG_ON(ret);
ret = btrfs_insert_dir_item(trans, parent_root,
dentry->d_name.name, dentry->d_name.len,
- parent_inode->i_ino, &key,
+ parent_inode, &key,
BTRFS_FT_DIR, index);
BUG_ON(ret);
@@ -970,6 +872,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
record_root_in_trans(trans, root);
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
+ btrfs_check_and_init_root_item(new_root_item);
root_flags = btrfs_root_flags(new_root_item);
if (pending->readonly)
@@ -999,7 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_add_root_ref(trans, tree_root, objectid,
parent_root->root_key.objectid,
- parent_inode->i_ino, index,
+ btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
dput(parent);
@@ -1027,6 +930,14 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
int ret;
list_for_each_entry(pending, head, list) {
+ /*
+ * We must deal with the delayed items before creating
+ * snapshots, or we will create a snapshot with inconsistent
+ * information.
+ */
+ ret = btrfs_run_delayed_items(trans, fs_info->fs_root);
+ BUG_ON(ret);
+
ret = create_pending_snapshot(trans, fs_info, pending);
BUG_ON(ret);
}
@@ -1156,7 +1067,8 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
struct btrfs_transaction *cur_trans;
ac = kmalloc(sizeof(*ac), GFP_NOFS);
- BUG_ON(!ac);
+ if (!ac)
+ return -ENOMEM;
INIT_DELAYED_WORK(&ac->work, do_async_commit);
ac->root = root;
@@ -1170,7 +1082,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
/* take transaction reference */
mutex_lock(&root->fs_info->trans_mutex);
cur_trans = trans->transaction;
- cur_trans->use_count++;
+ atomic_inc(&cur_trans->use_count);
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
@@ -1229,7 +1141,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->trans_mutex);
if (cur_trans->in_commit) {
- cur_trans->use_count++;
+ atomic_inc(&cur_trans->use_count);
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
@@ -1251,7 +1163,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
if (!prev_trans->commit_done) {
- prev_trans->use_count++;
+ atomic_inc(&prev_trans->use_count);
mutex_unlock(&root->fs_info->trans_mutex);
wait_for_commit(root, prev_trans);
@@ -1279,6 +1191,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
+ ret = btrfs_run_delayed_items(trans, root);
+ BUG_ON(ret);
+
/*
* rename don't use btrfs_join_transaction, so, once we
* set the transaction to blocked above, we aren't going
@@ -1292,24 +1207,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
TASK_UNINTERRUPTIBLE);
smp_mb();
- if (cur_trans->num_writers > 1)
+ if (atomic_read(&cur_trans->num_writers) > 1)
schedule_timeout(MAX_SCHEDULE_TIMEOUT);
else if (should_grow)
schedule_timeout(1);
mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
- } while (cur_trans->num_writers > 1 ||
+ } while (atomic_read(&cur_trans->num_writers) > 1 ||
(should_grow && cur_trans->num_joined != joined));
ret = create_pending_snapshots(trans, root->fs_info);
BUG_ON(ret);
+ ret = btrfs_run_delayed_items(trans, root);
+ BUG_ON(ret);
+
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
BUG_ON(ret);
WARN_ON(cur_trans != trans->transaction);
+ btrfs_scrub_pause(root);
/* btrfs_commit_tree_roots is responsible for getting the
* various roots consistent with each other. Every pointer
* in the tree of tree roots has to point to the most up to date
@@ -1386,11 +1305,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wake_up(&cur_trans->commit_wait);
+ list_del_init(&cur_trans->list);
put_transaction(cur_trans);
put_transaction(cur_trans);
+ trace_btrfs_transaction_commit(root);
+
mutex_unlock(&root->fs_info->trans_mutex);
+ btrfs_scrub_continue(root);
+
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -1418,6 +1342,8 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
root = list_entry(list.next, struct btrfs_root, root_list);
list_del(&root->root_list);
+ btrfs_kill_all_delayed_nodes(root);
+
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
btrfs_drop_snapshot(root, NULL, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..804c88639e5d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
* total writers in this transaction, it must be zero before the
* transaction can end
*/
- unsigned long num_writers;
+ atomic_t num_writers;
unsigned long num_joined;
int in_commit;
- int use_count;
+ atomic_t use_count;
int commit_done;
int blocked;
struct list_head list;
@@ -101,11 +101,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_drop_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
int btrfs_clean_old_snapshots(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -115,6 +112,8 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 992ab425599d..3b580ee8ab1d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -97,7 +97,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
ret = 0;
goto out;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (wret < 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a4bbb854dfd2..592396c6dc47 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,13 +333,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
goto insert;
if (item_size == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
if (!dst_copy || !src_copy) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
kfree(dst_copy);
kfree(src_copy);
return -ENOMEM;
@@ -361,13 +361,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
* sync
*/
if (ret == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
}
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* try to insert the key into the destination tree */
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
@@ -382,7 +382,6 @@ insert:
} else if (found_size < item_size) {
ret = btrfs_extend_item(trans, root, path,
item_size - found_size);
- BUG_ON(ret);
}
} else if (ret) {
return ret;
@@ -438,7 +437,7 @@ insert:
}
no_copy:
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return 0;
}
@@ -519,7 +518,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
start, 0);
if (ret == 0 &&
@@ -544,11 +543,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
* we don't have to do anything
*/
if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto out;
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
@@ -590,6 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset,
0, root->root_key.objectid,
key->objectid, offset);
+ BUG_ON(ret);
} else {
/*
* insert the extent pointer in the extent
@@ -600,7 +600,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
key->objectid, offset, &ins);
BUG_ON(ret);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (btrfs_file_extent_compression(eb, item)) {
csum_start = ins.objectid;
@@ -614,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(root->log_root,
csum_start, csum_end - 1,
- &ordered_sums);
+ &ordered_sums, 0);
BUG_ON(ret);
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums;
@@ -629,7 +629,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
kfree(sums);
}
} else {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
/* inline extents are easy, we just overwrite them */
@@ -675,10 +675,13 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = read_one_inode(root, location.objectid);
- BUG_ON(!inode);
+ if (!inode) {
+ kfree(name);
+ return -EIO;
+ }
ret = link_to_fixup_dir(trans, root, path, location.objectid);
BUG_ON(ret);
@@ -713,7 +716,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
goto out;
} else
goto out;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
if (di && !IS_ERR(di)) {
@@ -724,7 +727,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
goto out;
match = 1;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return match;
}
@@ -799,12 +802,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct inode *dir;
int ret;
struct btrfs_inode_ref *ref;
- struct btrfs_dir_item *di;
struct inode *inode;
char *name;
int namelen;
unsigned long ref_ptr;
unsigned long ref_end;
+ int search_done = 0;
/*
* it is possible that we didn't log all the parent directories
@@ -817,7 +820,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
return -ENOENT;
inode = read_one_inode(root, key->objectid);
- BUG_ON(!inode);
+ if (!inode) {
+ iput(dir);
+ return -EIO;
+ }
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -832,7 +838,7 @@ again:
read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
/* if we already have a perfect match, we're done */
- if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
+ if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
btrfs_inode_ref_index(eb, ref),
name, namelen)) {
goto out;
@@ -845,7 +851,10 @@ again:
* existing back reference, and we don't want to create
* dangling pointers in the directory.
*/
-conflict_again:
+
+ if (search_done)
+ goto insert;
+
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret == 0) {
char *victim_name;
@@ -881,42 +890,26 @@ conflict_again:
if (!backref_in_log(log, key, victim_name,
victim_name_len)) {
btrfs_inc_nlink(inode);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
- kfree(victim_name);
- btrfs_release_path(root, path);
- goto conflict_again;
}
kfree(victim_name);
ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
}
BUG_ON(ret);
- }
- btrfs_release_path(root, path);
-
- /* look for a conflicting sequence number */
- di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
- btrfs_inode_ref_index(eb, ref),
- name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
- }
- btrfs_release_path(root, path);
-
- /* look for a conflicting name */
- di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
- name, namelen, 0);
- if (di && !IS_ERR(di)) {
- ret = drop_one_dir_item(trans, root, path, dir, di);
- BUG_ON(ret);
+ /*
+ * NOTE: we have searched root tree and checked the
+ * corresponding ref, it does not need to check again.
+ */
+ search_done = 1;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
+insert:
/* insert our name */
ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
btrfs_inode_ref_index(eb, ref));
@@ -935,7 +928,7 @@ out:
BUG_ON(ret);
out_nowrite:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
iput(dir);
iput(inode);
return 0;
@@ -973,8 +966,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
unsigned long ptr;
unsigned long ptr_end;
int name_len;
+ u64 ino = btrfs_ino(inode);
- key.objectid = inode->i_ino;
+ key.objectid = ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
@@ -993,7 +987,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
}
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
- if (key.objectid != inode->i_ino ||
+ if (key.objectid != ino ||
key.type != BTRFS_INODE_REF_KEY)
break;
ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
@@ -1012,9 +1006,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (key.offset == 0)
break;
key.offset--;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (nlink != inode->i_nlink) {
inode->i_nlink = nlink;
btrfs_update_inode(trans, root, inode);
@@ -1024,10 +1018,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (inode->i_nlink == 0) {
if (S_ISDIR(inode->i_mode)) {
ret = replay_dir_deletes(trans, root, NULL, path,
- inode->i_ino, 1);
+ ino, 1);
BUG_ON(ret);
}
- ret = insert_orphan_item(trans, root, inode->i_ino);
+ ret = insert_orphan_item(trans, root, ino);
BUG_ON(ret);
}
btrfs_free_path(path);
@@ -1063,11 +1057,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
+ if (ret)
+ goto out;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- BUG_ON(!inode);
+ if (!inode)
+ return -EIO;
ret = fixup_inode_link_count(trans, root, inode);
BUG_ON(ret);
@@ -1081,8 +1077,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- btrfs_release_path(root, path);
- return 0;
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ return ret;
}
@@ -1101,7 +1099,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct inode *inode;
inode = read_one_inode(root, objectid);
- BUG_ON(!inode);
+ if (!inode)
+ return -EIO;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
@@ -1109,7 +1108,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (ret == 0) {
btrfs_inc_nlink(inode);
btrfs_update_inode(trans, root, inode);
@@ -1188,7 +1187,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
int ret;
dir = read_one_inode(root, key->objectid);
- BUG_ON(!dir);
+ if (!dir)
+ return -EIO;
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
@@ -1205,7 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
exists = 1;
else
exists = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1218,7 +1218,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
} else {
BUG();
}
- if (!dst_di || IS_ERR(dst_di)) {
+ if (IS_ERR_OR_NULL(dst_di)) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -1249,13 +1249,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
if (key->type == BTRFS_DIR_INDEX_KEY)
goto insert;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
kfree(name);
iput(dir);
return 0;
insert:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
@@ -1286,6 +1286,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di))
+ return -EIO;
name_len = btrfs_dir_name_len(eb, di);
ret = replay_one_name(trans, root, path, eb, di, key);
BUG_ON(ret);
@@ -1374,7 +1376,7 @@ next:
*end_ret = found_end;
ret = 0;
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
@@ -1412,6 +1414,11 @@ again:
ptr_end = ptr + item_size;
while (ptr < ptr_end) {
di = (struct btrfs_dir_item *)ptr;
+ if (verify_dir_item(root, eb, di)) {
+ ret = -EIO;
+ goto out;
+ }
+
name_len = btrfs_dir_name_len(eb, di);
name = kmalloc(name_len, GFP_NOFS);
if (!name) {
@@ -1432,12 +1439,15 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || IS_ERR(log_di)) {
+ if (IS_ERR_OR_NULL(log_di)) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
- btrfs_release_path(root, path);
- btrfs_release_path(log, log_path);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
inode = read_one_inode(root, location.objectid);
- BUG_ON(!inode);
+ if (!inode) {
+ kfree(name);
+ return -EIO;
+ }
ret = link_to_fixup_dir(trans, root,
path, location.objectid);
@@ -1459,7 +1469,7 @@ again:
ret = 0;
goto out;
}
- btrfs_release_path(log, log_path);
+ btrfs_release_path(log_path);
kfree(name);
ptr = (unsigned long)(di + 1);
@@ -1467,8 +1477,8 @@ again:
}
ret = 0;
out:
- btrfs_release_path(root, path);
- btrfs_release_path(log, log_path);
+ btrfs_release_path(path);
+ btrfs_release_path(log_path);
return ret;
}
@@ -1556,7 +1566,7 @@ again:
break;
dir_key.offset = found_key.offset + 1;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (range_end == (u64)-1)
break;
range_start = range_end + 1;
@@ -1567,11 +1577,11 @@ next_type:
if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
key_type = BTRFS_DIR_LOG_INDEX_KEY;
dir_key.type = BTRFS_DIR_INDEX_KEY;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
out:
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
btrfs_free_path(log_path);
iput(dir);
return ret;
@@ -1821,7 +1831,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
int orig_level;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
level = btrfs_header_level(log->node);
orig_level = level;
@@ -2098,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
+ btrfs_scrub_pause_super(root);
write_ctree_super(trans, root->fs_info->tree_root, 1);
+ btrfs_scrub_continue_super(root);
ret = 0;
mutex_lock(&root->log_mutex);
@@ -2202,6 +2215,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int ret;
int err = 0;
int bytes_del = 0;
+ u64 dir_ino = btrfs_ino(dir);
if (BTRFS_I(dir)->logged_trans < trans->transid)
return 0;
@@ -2214,10 +2228,12 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
- di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
+ di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2228,8 +2244,8 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
bytes_del += name_len;
BUG_ON(ret);
}
- btrfs_release_path(log, path);
- di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
+ btrfs_release_path(path);
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
index, name, name_len, -1);
if (IS_ERR(di)) {
err = PTR_ERR(di);
@@ -2247,10 +2263,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
if (bytes_del) {
struct btrfs_key key;
- key.objectid = dir->i_ino;
+ key.objectid = dir_ino;
key.offset = 0;
key.type = BTRFS_INODE_ITEM_KEY;
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
if (ret < 0) {
@@ -2272,10 +2288,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
} else
ret = 0;
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
}
fail:
btrfs_free_path(path);
+out_unlock:
mutex_unlock(&BTRFS_I(dir)->log_mutex);
if (ret == -ENOSPC) {
root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -2305,7 +2322,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
log = root->log_root;
mutex_lock(&BTRFS_I(inode)->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
+ ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
@@ -2346,7 +2363,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item);
btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
return 0;
}
@@ -2371,13 +2388,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
int nritems;
u64 first_offset = min_offset;
u64 last_offset = (u64)-1;
+ u64 ino = btrfs_ino(inode);
log = root->log_root;
- max_key.objectid = inode->i_ino;
+ max_key.objectid = ino;
max_key.offset = (u64)-1;
max_key.type = key_type;
- min_key.objectid = inode->i_ino;
+ min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = min_offset;
@@ -2390,18 +2408,17 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* we didn't find anything from this transaction, see if there
* is anything at all
*/
- if (ret != 0 || min_key.objectid != inode->i_ino ||
- min_key.type != key_type) {
- min_key.objectid = inode->i_ino;
+ if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
+ min_key.objectid = ino;
min_key.type = key_type;
min_key.offset = (u64)-1;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret < 0) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
return ret;
}
- ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, key_type);
/* if ret == 0 there are items for this type,
* create a range to tell us the last key of this type.
@@ -2419,7 +2436,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
/* go backward to find any previous key */
- ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
+ ret = btrfs_previous_item(root, path, ino, key_type);
if (ret == 0) {
struct btrfs_key tmp;
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -2434,7 +2451,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
}
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* find the first key from this transaction again */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -2454,8 +2471,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
for (i = path->slots[0]; i < nritems; i++) {
btrfs_item_key_to_cpu(src, &min_key, i);
- if (min_key.objectid != inode->i_ino ||
- min_key.type != key_type)
+ if (min_key.objectid != ino || min_key.type != key_type)
goto done;
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
@@ -2476,7 +2492,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
goto done;
}
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
+ if (tmp.objectid != ino || tmp.type != key_type) {
last_offset = (u64)-1;
goto done;
}
@@ -2492,8 +2508,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
}
}
done:
- btrfs_release_path(root, path);
- btrfs_release_path(log, dst_path);
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
if (err == 0) {
*last_offset_ret = last_offset;
@@ -2502,8 +2518,7 @@ done:
* is valid
*/
ret = insert_dir_log_key(trans, log, path, key_type,
- inode->i_ino, first_offset,
- last_offset);
+ ino, first_offset, last_offset);
if (ret)
err = ret;
}
@@ -2589,10 +2604,11 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
break;
ret = btrfs_del_item(trans, log, path);
- BUG_ON(ret);
- btrfs_release_path(log, path);
+ if (ret)
+ break;
+ btrfs_release_path(path);
}
- btrfs_release_path(log, path);
+ btrfs_release_path(path);
return ret;
}
@@ -2667,6 +2683,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
extent = btrfs_item_ptr(src, start_slot + i,
struct btrfs_file_extent_item);
+ if (btrfs_file_extent_generation(src, extent) < trans->transid)
+ continue;
+
found_type = btrfs_file_extent_type(src, extent);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -2691,14 +2710,14 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_csums_range(
log->fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
- &ordered_sums);
+ &ordered_sums, 0);
BUG_ON(ret);
}
}
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
- btrfs_release_path(log, dst_path);
+ btrfs_release_path(dst_path);
kfree(ins_data);
/*
@@ -2747,6 +2766,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
int nritems;
int ins_start_slot = 0;
int ins_nr;
+ u64 ino = btrfs_ino(inode);
log = root->log_root;
@@ -2759,11 +2779,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
return -ENOMEM;
}
- min_key.objectid = inode->i_ino;
+ min_key.objectid = ino;
min_key.type = BTRFS_INODE_ITEM_KEY;
min_key.offset = 0;
- max_key.objectid = inode->i_ino;
+ max_key.objectid = ino;
/* today the code can only do partial logging of directories */
if (!S_ISDIR(inode->i_mode))
@@ -2775,6 +2795,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
+ }
+
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
@@ -2786,8 +2813,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_objectid_items(trans, log, path,
- inode->i_ino, max_key_type);
+ ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
}
@@ -2805,7 +2831,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
break;
again:
/* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != inode->i_ino)
+ if (min_key.objectid != ino)
break;
if (min_key.type > max_key.type)
break;
@@ -2847,7 +2873,7 @@ next_slot:
}
ins_nr = 0;
}
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (min_key.offset < (u64)-1)
min_key.offset++;
@@ -2870,8 +2896,8 @@ next_slot:
}
WARN_ON(ins_nr);
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
- btrfs_release_path(root, path);
- btrfs_release_path(log, dst_path);
+ btrfs_release_path(path);
+ btrfs_release_path(dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
if (ret) {
err = ret;
@@ -3107,9 +3133,11 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
.stage = 0,
};
- fs_info->log_root_recovering = 1;
path = btrfs_alloc_path();
- BUG_ON(!path);
+ if (!path)
+ return -ENOMEM;
+
+ fs_info->log_root_recovering = 1;
trans = btrfs_start_transaction(fs_info->tree_root, 0);
BUG_ON(IS_ERR(trans));
@@ -3117,7 +3145,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
wc.trans = trans;
wc.pin = 1;
- walk_log_tree(trans, log_root_tree, &wc);
+ ret = walk_log_tree(trans, log_root_tree, &wc);
+ BUG_ON(ret);
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3135,14 +3164,13 @@ again:
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
- btrfs_release_path(log_root_tree, path);
+ btrfs_release_path(path);
if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
break;
log = btrfs_read_fs_root_no_radix(log_root_tree,
&found_key);
- BUG_ON(!log);
-
+ BUG_ON(IS_ERR(log));
tmp_key.objectid = found_key.offset;
tmp_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -3171,7 +3199,7 @@ again:
if (found_key.offset == 0)
break;
}
- btrfs_release_path(log_root_tree, path);
+ btrfs_release_path(path);
/* step one is to pin it all, step two is to replay just inodes */
if (wc.pin) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 3dfae84c8cc8..2270ac58d746 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
-int btrfs_join_running_log_trans(struct btrfs_root *root);
int btrfs_end_log_trans(struct btrfs_root *root);
int btrfs_pin_log_trans(struct btrfs_root *root);
int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
deleted file mode 100644
index 1ca1952fd917..000000000000
--- a/fs/btrfs/version.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-#
-# determine-version -- report a useful version for releases
-#
-# Copyright 2008, Aron Griffis <agriffis@n01se.net>
-# Copyright 2008, Oracle
-# Released under the GNU GPLv2
-
-v="v0.16"
-
-which git &> /dev/null
-if [ $? == 0 ]; then
- git branch >& /dev/null
- if [ $? == 0 ]; then
- if head=`git rev-parse --verify HEAD 2>/dev/null`; then
- if tag=`git describe --tags 2>/dev/null`; then
- v="$tag"
- fi
-
- # Are there uncommitted changes?
- git update-index --refresh --unmerged > /dev/null
- if git diff-index --name-only HEAD | \
- grep -v "^scripts/package" \
- | read dummy; then
- v="$v"-dirty
- fi
- fi
- fi
-fi
-
-echo "#ifndef __BUILD_VERSION" > .build-version.h
-echo "#define __BUILD_VERSION" >> .build-version.h
-echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
-echo "#endif" >> .build-version.h
-
-diff -q version.h .build-version.h >& /dev/null
-
-if [ $? == 0 ]; then
- rm .build-version.h
- exit 0
-fi
-
-mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2636a051e4b2..c48214ef5c09 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,38 +33,14 @@
#include "volumes.h"
#include "async-thread.h"
-struct map_lookup {
- u64 type;
- int io_align;
- int io_width;
- int stripe_len;
- int sector_size;
- int num_stripes;
- int sub_stripes;
- struct btrfs_bio_stripe stripes[];
-};
-
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_bio_stripe) * (n)))
-
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
-void btrfs_lock_volumes(void)
-{
- mutex_lock(&uuid_mutex);
-}
-
-void btrfs_unlock_volumes(void)
-{
- mutex_unlock(&uuid_mutex);
-}
-
static void lock_chunks(struct btrfs_root *root)
{
mutex_lock(&root->fs_info->chunk_mutex);
@@ -162,22 +138,25 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *cur;
int again = 0;
unsigned long num_run;
- unsigned long num_sync_run;
unsigned long batch_run = 0;
unsigned long limit;
unsigned long last_waited = 0;
int force_reg = 0;
+ struct blk_plug plug;
+
+ /*
+ * this function runs all the bios we've collected for
+ * a particular device. We don't want to wander off to
+ * another device without first sending all of these down.
+ * So, setup a plug here and finish it off before we return
+ */
+ blk_start_plug(&plug);
bdi = blk_get_backing_dev_info(device->bdev);
fs_info = device->dev_root->fs_info;
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
- /* we want to make sure that every time we switch from the sync
- * list to the normal list, we unplug
- */
- num_sync_run = 0;
-
loop:
spin_lock(&device->io_lock);
@@ -223,15 +202,6 @@ loop_lock:
spin_unlock(&device->io_lock);
- /*
- * if we're doing the regular priority list, make sure we unplug
- * for any high prio bios we've sent down
- */
- if (pending_bios == &device->pending_bios && num_sync_run > 0) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
-
while (pending) {
rmb();
@@ -259,19 +229,11 @@ loop_lock:
BUG_ON(atomic_read(&cur->bi_cnt) == 0);
- if (cur->bi_rw & REQ_SYNC)
- num_sync_run++;
-
submit_bio(cur->bi_rw, cur);
num_run++;
batch_run++;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
/*
* we made progress, there is more work to do and the bdi
@@ -304,13 +266,8 @@ loop_lock:
* against it before looping
*/
last_waited = ioc->last_waited;
- if (need_resched()) {
- if (num_sync_run) {
- blk_run_backing_dev(bdi, NULL);
- num_sync_run = 0;
- }
+ if (need_resched())
cond_resched();
- }
continue;
}
spin_lock(&device->io_lock);
@@ -323,22 +280,6 @@ loop_lock:
}
}
- if (num_sync_run) {
- num_sync_run = 0;
- blk_run_backing_dev(bdi, NULL);
- }
- /*
- * IO has already been through a long path to get here. Checksumming,
- * async helper threads, perhaps compression. We've done a pretty
- * good job of collecting a batch of IO and should just unplug
- * the device right away.
- *
- * This will help anyone who is waiting on the IO, they might have
- * already unplugged, but managed to do so before the bio they
- * cared about found its way down here.
- */
- blk_run_backing_dev(bdi, NULL);
-
cond_resched();
if (again)
goto loop;
@@ -349,6 +290,7 @@ loop_lock:
spin_unlock(&device->io_lock);
done:
+ blk_finish_plug(&plug);
return 0;
}
@@ -408,7 +350,7 @@ static noinline int device_list_add(const char *path,
INIT_LIST_HEAD(&device->dev_alloc_list);
mutex_lock(&fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_devices->devices);
+ list_add_rcu(&device->dev_list, &fs_devices->devices);
mutex_unlock(&fs_devices->device_list_mutex);
device->fs_devices = fs_devices;
@@ -451,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
fs_devices->latest_trans = orig->latest_trans;
memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
- mutex_lock(&orig->device_list_mutex);
+ /* We have held the volume lock, it is safe to get the devices. */
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
device = kzalloc(sizeof(*device), GFP_NOFS);
if (!device)
@@ -474,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(-ENOMEM);
}
@@ -488,7 +428,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
again:
- mutex_lock(&fs_devices->device_list_mutex);
+ /* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
if (device->in_fs_metadata)
continue;
@@ -508,7 +448,6 @@ again:
kfree(device->name);
kfree(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
if (fs_devices->seed) {
fs_devices = fs_devices->seed;
@@ -519,6 +458,29 @@ again:
return 0;
}
+static void __free_device(struct work_struct *work)
+{
+ struct btrfs_device *device;
+
+ device = container_of(work, struct btrfs_device, rcu_work);
+
+ if (device->bdev)
+ blkdev_put(device->bdev, device->mode);
+
+ kfree(device->name);
+ kfree(device);
+}
+
+static void free_device(struct rcu_head *head)
+{
+ struct btrfs_device *device;
+
+ device = container_of(head, struct btrfs_device, rcu);
+
+ INIT_WORK(&device->rcu_work, __free_device);
+ schedule_work(&device->rcu_work);
+}
+
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
@@ -526,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (--fs_devices->opened > 0)
return 0;
+ mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- if (device->bdev) {
- blkdev_put(device->bdev, device->mode);
+ struct btrfs_device *new_device;
+
+ if (device->bdev)
fs_devices->open_devices--;
- }
+
if (device->writeable) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
- device->bdev = NULL;
- device->writeable = 0;
- device->in_fs_metadata = 0;
+ new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
+ BUG_ON(!new_device);
+ memcpy(new_device, device, sizeof(*new_device));
+ new_device->name = kstrdup(device->name, GFP_NOFS);
+ BUG_ON(!new_device->name);
+ new_device->bdev = NULL;
+ new_device->writeable = 0;
+ new_device->in_fs_metadata = 0;
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+
+ call_rcu(&device->rcu, free_device);
}
+ mutex_unlock(&fs_devices->device_list_mutex);
+
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
@@ -642,6 +616,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_add(&device->dev_alloc_list,
&fs_devices->alloc_list);
}
+ brelse(bh);
continue;
error_brelse:
@@ -860,10 +835,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
/* we don't want to overwrite the superblock on the drive,
* so we make sure to start at an offset of at least 1MB
*/
- search_start = 1024 * 1024;
-
- if (root->fs_info->alloc_start + num_bytes <= search_end)
- search_start = max(root->fs_info->alloc_start, search_start);
+ search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
max_hole_start = search_start;
max_hole_size = 0;
@@ -994,14 +966,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
if (ret > 0) {
ret = btrfs_previous_item(root, path, key.objectid,
BTRFS_DEV_EXTENT_KEY);
- BUG_ON(ret);
+ if (ret)
+ goto out;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
BUG_ON(found_key.offset > start || found_key.offset +
btrfs_dev_extent_length(leaf, extent) < start);
- ret = 0;
} else if (ret == 0) {
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1012,8 +984,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
if (device->bytes_used > 0)
device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
+out:
btrfs_free_path(path);
return ret;
}
@@ -1248,11 +1220,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct block_device *bdev;
struct buffer_head *bh = NULL;
struct btrfs_super_block *disk_super;
+ struct btrfs_fs_devices *cur_devices;
u64 all_avail;
u64 devid;
u64 num_devices;
u8 *dev_uuid;
int ret = 0;
+ bool clear_super = false;
mutex_lock(&uuid_mutex);
mutex_lock(&root->fs_info->volume_mutex);
@@ -1283,14 +1257,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
device = NULL;
devices = &root->fs_info->fs_devices->devices;
- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ /*
+ * It is safe to read the devices since the volume_mutex
+ * is held.
+ */
list_for_each_entry(tmp, devices, dev_list) {
if (tmp->in_fs_metadata && !tmp->bdev) {
device = tmp;
break;
}
}
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
bdev = NULL;
bh = NULL;
disk_super = NULL;
@@ -1332,28 +1308,33 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
}
if (device->writeable) {
+ lock_chunks(root);
list_del_init(&device->dev_alloc_list);
+ unlock_chunks(root);
root->fs_info->fs_devices->rw_devices--;
+ clear_super = true;
}
ret = btrfs_shrink_device(device, 0);
if (ret)
- goto error_brelse;
+ goto error_undo;
ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
if (ret)
- goto error_brelse;
+ goto error_undo;
device->in_fs_metadata = 0;
+ btrfs_scrub_cancel_dev(root, device);
/*
* the device list mutex makes sure that we don't change
* the device list while someone else is writing out all
* the device supers.
*/
+
+ cur_devices = device->fs_devices;
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- list_del_init(&device->dev_list);
- mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ list_del_rcu(&device->dev_list);
device->fs_devices->num_devices--;
@@ -1367,34 +1348,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
- if (device->bdev) {
- blkdev_put(device->bdev, device->mode);
- device->bdev = NULL;
+ if (device->bdev)
device->fs_devices->open_devices--;
- }
+
+ call_rcu(&device->rcu, free_device);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
- if (device->fs_devices->open_devices == 0) {
+ if (cur_devices->open_devices == 0) {
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == device->fs_devices)
+ if (fs_devices->seed == cur_devices)
break;
fs_devices = fs_devices->seed;
}
- fs_devices->seed = device->fs_devices->seed;
- device->fs_devices->seed = NULL;
- __btrfs_close_devices(device->fs_devices);
- free_fs_devices(device->fs_devices);
+ fs_devices->seed = cur_devices->seed;
+ cur_devices->seed = NULL;
+ lock_chunks(root);
+ __btrfs_close_devices(cur_devices);
+ unlock_chunks(root);
+ free_fs_devices(cur_devices);
}
/*
* at this point, the device is zero sized. We want to
* remove it from the devices list and zero out the old super
*/
- if (device->writeable) {
+ if (clear_super) {
/* make sure this device isn't detected as part of
* the FS anymore
*/
@@ -1403,8 +1386,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
sync_dirty_buffer(bh);
}
- kfree(device->name);
- kfree(device);
ret = 0;
error_brelse:
@@ -1416,6 +1397,15 @@ out:
mutex_unlock(&root->fs_info->volume_mutex);
mutex_unlock(&uuid_mutex);
return ret;
+error_undo:
+ if (device->writeable) {
+ lock_chunks(root);
+ list_add(&device->dev_alloc_list,
+ &root->fs_info->fs_devices->alloc_list);
+ unlock_chunks(root);
+ root->fs_info->fs_devices->rw_devices++;
+ }
+ goto error_brelse;
}
/*
@@ -1452,7 +1442,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
INIT_LIST_HEAD(&seed_devices->devices);
INIT_LIST_HEAD(&seed_devices->alloc_list);
mutex_init(&seed_devices->device_list_mutex);
- list_splice_init(&fs_devices->devices, &seed_devices->devices);
+
+ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+ list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
+ synchronize_rcu);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
list_for_each_entry(device, &seed_devices->devices, dev_list) {
device->fs_devices = seed_devices;
@@ -1513,7 +1508,7 @@ next_slot:
goto error;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
continue;
}
@@ -1605,12 +1600,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = find_next_devid(root, &device->devid);
if (ret) {
+ kfree(device->name);
kfree(device);
goto error;
}
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
+ kfree(device->name);
kfree(device);
ret = PTR_ERR(trans);
goto error;
@@ -1631,7 +1628,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->dev_root = root->fs_info->dev_root;
device->bdev = bdev;
device->in_fs_metadata = 1;
- device->mode = 0;
+ device->mode = FMODE_EXCL;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
@@ -1647,7 +1644,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
* half setup
*/
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
+ list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
list_add(&device->dev_alloc_list,
&root->fs_info->fs_devices->alloc_list);
root->fs_info->fs_devices->num_devices++;
@@ -1805,10 +1802,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
BUG_ON(ret);
ret = btrfs_del_item(trans, root, path);
- BUG_ON(ret);
btrfs_free_path(path);
- return 0;
+ return ret;
}
static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
@@ -1914,6 +1910,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
BUG_ON(ret);
+ trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
BUG_ON(ret);
@@ -1981,7 +1979,7 @@ again:
chunk = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
- btrfs_release_path(chunk_root, path);
+ btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -2099,7 +2097,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
if (found_key.offset == 0)
break;
- btrfs_release_path(chunk_root, path);
+ btrfs_release_path(path);
ret = btrfs_relocate_chunk(chunk_root,
chunk_root->root_key.objectid,
found_key.objectid,
@@ -2171,7 +2169,7 @@ again:
goto done;
if (ret) {
ret = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
@@ -2180,7 +2178,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
@@ -2188,14 +2186,14 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
break;
}
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
@@ -2271,275 +2269,204 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
return 0;
}
-static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
- int num_stripes, int sub_stripes)
+/*
+ * sort the devices in descending order by max_avail, total_avail
+ */
+static int btrfs_cmp_device_info(const void *a, const void *b)
{
- if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
- return calc_size;
- else if (type & BTRFS_BLOCK_GROUP_RAID10)
- return calc_size * (num_stripes / sub_stripes);
- else
- return calc_size * num_stripes;
-}
+ const struct btrfs_device_info *di_a = a;
+ const struct btrfs_device_info *di_b = b;
-/* Used to sort the devices by max_avail(descending sort) */
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
-{
- if (((struct btrfs_device_info *)dev_info1)->max_avail >
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ if (di_a->max_avail > di_b->max_avail)
return -1;
- else if (((struct btrfs_device_info *)dev_info1)->max_avail <
- ((struct btrfs_device_info *)dev_info2)->max_avail)
+ if (di_a->max_avail < di_b->max_avail)
return 1;
- else
- return 0;
+ if (di_a->total_avail > di_b->total_avail)
+ return -1;
+ if (di_a->total_avail < di_b->total_avail)
+ return 1;
+ return 0;
}
-static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
- int *num_stripes, int *min_stripes,
- int *sub_stripes)
+static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *extent_root,
+ struct map_lookup **map_ret,
+ u64 *num_bytes_out, u64 *stripe_size_out,
+ u64 start, u64 type)
{
- *num_stripes = 1;
- *min_stripes = 1;
- *sub_stripes = 0;
+ struct btrfs_fs_info *info = extent_root->fs_info;
+ struct btrfs_fs_devices *fs_devices = info->fs_devices;
+ struct list_head *cur;
+ struct map_lookup *map = NULL;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+ struct btrfs_device_info *devices_info = NULL;
+ u64 total_avail;
+ int num_stripes; /* total number of stripes to allocate */
+ int sub_stripes; /* sub_stripes info for map */
+ int dev_stripes; /* stripes per dev */
+ int devs_max; /* max devs to use */
+ int devs_min; /* min devs needed */
+ int devs_increment; /* ndevs has to be a multiple of this */
+ int ncopies; /* how many copies to data has */
+ int ret;
+ u64 max_stripe_size;
+ u64 max_chunk_size;
+ u64 stripe_size;
+ u64 num_bytes;
+ int ndevs;
+ int i;
+ int j;
- if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- *num_stripes = fs_devices->rw_devices;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_DUP)) {
- *num_stripes = 2;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
- if (fs_devices->rw_devices < 2)
- return -ENOSPC;
- *num_stripes = 2;
- *min_stripes = 2;
- }
- if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- *num_stripes = fs_devices->rw_devices;
- if (*num_stripes < 4)
- return -ENOSPC;
- *num_stripes &= ~(u32)1;
- *sub_stripes = 2;
- *min_stripes = 4;
+ if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
+ (type & BTRFS_BLOCK_GROUP_DUP)) {
+ WARN_ON(1);
+ type &= ~BTRFS_BLOCK_GROUP_DUP;
}
- return 0;
-}
+ if (list_empty(&fs_devices->alloc_list))
+ return -ENOSPC;
-static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
- u64 proposed_size, u64 type,
- int num_stripes, int small_stripe)
-{
- int min_stripe_size = 1 * 1024 * 1024;
- u64 calc_size = proposed_size;
- u64 max_chunk_size = calc_size;
- int ncopies = 1;
+ sub_stripes = 1;
+ dev_stripes = 1;
+ devs_increment = 1;
+ ncopies = 1;
+ devs_max = 0; /* 0 == as many as possible */
+ devs_min = 1;
- if (type & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID10))
+ /*
+ * define the properties of each RAID type.
+ * FIXME: move this to a global table and use it in all RAID
+ * calculation code
+ */
+ if (type & (BTRFS_BLOCK_GROUP_DUP)) {
+ dev_stripes = 2;
+ ncopies = 2;
+ devs_max = 1;
+ } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
+ devs_min = 2;
+ } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
+ devs_increment = 2;
+ ncopies = 2;
+ devs_max = 2;
+ devs_min = 2;
+ } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
+ sub_stripes = 2;
+ devs_increment = 2;
ncopies = 2;
+ devs_min = 4;
+ } else {
+ devs_max = 1;
+ }
if (type & BTRFS_BLOCK_GROUP_DATA) {
- max_chunk_size = 10 * calc_size;
- min_stripe_size = 64 * 1024 * 1024;
+ max_stripe_size = 1024 * 1024 * 1024;
+ max_chunk_size = 10 * max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
- max_chunk_size = 256 * 1024 * 1024;
- min_stripe_size = 32 * 1024 * 1024;
+ max_stripe_size = 256 * 1024 * 1024;
+ max_chunk_size = max_stripe_size;
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
- calc_size = 8 * 1024 * 1024;
- max_chunk_size = calc_size * 2;
- min_stripe_size = 1 * 1024 * 1024;
+ max_stripe_size = 8 * 1024 * 1024;
+ max_chunk_size = 2 * max_stripe_size;
+ } else {
+ printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
+ type);
+ BUG_ON(1);
}
/* we don't want a chunk larger than 10% of writeable space */
max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
max_chunk_size);
- if (calc_size * num_stripes > max_chunk_size * ncopies) {
- calc_size = max_chunk_size * ncopies;
- do_div(calc_size, num_stripes);
- do_div(calc_size, BTRFS_STRIPE_LEN);
- calc_size *= BTRFS_STRIPE_LEN;
- }
+ devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+ GFP_NOFS);
+ if (!devices_info)
+ return -ENOMEM;
- /* we don't want tiny stripes */
- if (!small_stripe)
- calc_size = max_t(u64, min_stripe_size, calc_size);
+ cur = fs_devices->alloc_list.next;
/*
- * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
- * we end up with something bigger than a stripe
+ * in the first pass through the devices list, we gather information
+ * about the available holes on each device.
*/
- calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
+ ndevs = 0;
+ while (cur != &fs_devices->alloc_list) {
+ struct btrfs_device *device;
+ u64 max_avail;
+ u64 dev_offset;
- do_div(calc_size, BTRFS_STRIPE_LEN);
- calc_size *= BTRFS_STRIPE_LEN;
-
- return calc_size;
-}
-
-static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
- int num_stripes)
-{
- struct map_lookup *new;
- size_t len = map_lookup_size(num_stripes);
-
- BUG_ON(map->num_stripes < num_stripes);
-
- if (map->num_stripes == num_stripes)
- return map;
-
- new = kmalloc(len, GFP_NOFS);
- if (!new) {
- /* just change map->num_stripes */
- map->num_stripes = num_stripes;
- return map;
- }
-
- memcpy(new, map, len);
- new->num_stripes = num_stripes;
- kfree(map);
- return new;
-}
-
-/*
- * helper to allocate device space from btrfs_device_info, in which we stored
- * max free space information of every device. It is used when we can not
- * allocate chunks by default size.
- *
- * By this helper, we can allocate a new chunk as larger as possible.
- */
-static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
- struct btrfs_fs_devices *fs_devices,
- struct btrfs_device_info *devices,
- int nr_device, u64 type,
- struct map_lookup **map_lookup,
- int min_stripes, u64 *stripe_size)
-{
- int i, index, sort_again = 0;
- int min_devices = min_stripes;
- u64 max_avail, min_free;
- struct map_lookup *map = *map_lookup;
- int ret;
+ device = list_entry(cur, struct btrfs_device, dev_alloc_list);
- if (nr_device < min_stripes)
- return -ENOSPC;
+ cur = cur->next;
- btrfs_descending_sort_devices(devices, nr_device);
+ if (!device->writeable) {
+ printk(KERN_ERR
+ "btrfs: read-only device in alloc_list\n");
+ WARN_ON(1);
+ continue;
+ }
- max_avail = devices[0].max_avail;
- if (!max_avail)
- return -ENOSPC;
+ if (!device->in_fs_metadata)
+ continue;
- for (i = 0; i < nr_device; i++) {
- /*
- * if dev_offset = 0, it means the free space of this device
- * is less than what we need, and we didn't search max avail
- * extent on this device, so do it now.
+ if (device->total_bytes > device->bytes_used)
+ total_avail = device->total_bytes - device->bytes_used;
+ else
+ total_avail = 0;
+ /* avail is off by max(alloc_start, 1MB), but that is the same
+ * for all devices, so it doesn't hurt the sorting later on
*/
- if (!devices[i].dev_offset) {
- ret = find_free_dev_extent(trans, devices[i].dev,
- max_avail,
- &devices[i].dev_offset,
- &devices[i].max_avail);
- if (ret != 0 && ret != -ENOSPC)
- return ret;
- sort_again = 1;
- }
- }
- /* we update the max avail free extent of each devices, sort again */
- if (sort_again)
- btrfs_descending_sort_devices(devices, nr_device);
-
- if (type & BTRFS_BLOCK_GROUP_DUP)
- min_devices = 1;
+ ret = find_free_dev_extent(trans, device,
+ max_stripe_size * dev_stripes,
+ &dev_offset, &max_avail);
+ if (ret && ret != -ENOSPC)
+ goto error;
- if (!devices[min_devices - 1].max_avail)
- return -ENOSPC;
+ if (ret == 0)
+ max_avail = max_stripe_size * dev_stripes;
- max_avail = devices[min_devices - 1].max_avail;
- if (type & BTRFS_BLOCK_GROUP_DUP)
- do_div(max_avail, 2);
+ if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
+ continue;
- max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
- min_stripes, 1);
- if (type & BTRFS_BLOCK_GROUP_DUP)
- min_free = max_avail * 2;
- else
- min_free = max_avail;
+ devices_info[ndevs].dev_offset = dev_offset;
+ devices_info[ndevs].max_avail = max_avail;
+ devices_info[ndevs].total_avail = total_avail;
+ devices_info[ndevs].dev = device;
+ ++ndevs;
+ }
- if (min_free > devices[min_devices - 1].max_avail)
- return -ENOSPC;
+ /*
+ * now sort the devices by hole size / available space
+ */
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
- map = __shrink_map_lookup_stripes(map, min_stripes);
- *stripe_size = max_avail;
+ /* round down to number of usable stripes */
+ ndevs -= ndevs % devs_increment;
- index = 0;
- for (i = 0; i < min_stripes; i++) {
- map->stripes[i].dev = devices[index].dev;
- map->stripes[i].physical = devices[index].dev_offset;
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- i++;
- map->stripes[i].dev = devices[index].dev;
- map->stripes[i].physical = devices[index].dev_offset +
- max_avail;
- }
- index++;
+ if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
+ ret = -ENOSPC;
+ goto error;
}
- *map_lookup = map;
-
- return 0;
-}
-static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct map_lookup **map_ret,
- u64 *num_bytes, u64 *stripe_size,
- u64 start, u64 type)
-{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = info->fs_devices;
- struct list_head *cur;
- struct map_lookup *map;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
- struct btrfs_device_info *devices_info;
- struct list_head private_devs;
- u64 calc_size = 1024 * 1024 * 1024;
- u64 min_free;
- u64 avail;
- u64 dev_offset;
- int num_stripes;
- int min_stripes;
- int sub_stripes;
- int min_devices; /* the min number of devices we need */
- int i;
- int ret;
- int index;
+ if (devs_max && ndevs > devs_max)
+ ndevs = devs_max;
+ /*
+ * the primary goal is to maximize the number of stripes, so use as many
+ * devices as possible, even if the stripes are not maximum sized.
+ */
+ stripe_size = devices_info[ndevs-1].max_avail;
+ num_stripes = ndevs * dev_stripes;
- if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
- (type & BTRFS_BLOCK_GROUP_DUP)) {
- WARN_ON(1);
- type &= ~BTRFS_BLOCK_GROUP_DUP;
+ if (stripe_size * num_stripes > max_chunk_size * ncopies) {
+ stripe_size = max_chunk_size * ncopies;
+ do_div(stripe_size, num_stripes);
}
- if (list_empty(&fs_devices->alloc_list))
- return -ENOSPC;
- ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
- &min_stripes, &sub_stripes);
- if (ret)
- return ret;
-
- devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
- GFP_NOFS);
- if (!devices_info)
- return -ENOMEM;
+ do_div(stripe_size, dev_stripes);
+ do_div(stripe_size, BTRFS_STRIPE_LEN);
+ stripe_size *= BTRFS_STRIPE_LEN;
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
if (!map) {
@@ -2548,85 +2475,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
map->num_stripes = num_stripes;
- cur = fs_devices->alloc_list.next;
- index = 0;
- i = 0;
-
- calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
- num_stripes, 0);
-
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- min_free = calc_size * 2;
- min_devices = 1;
- } else {
- min_free = calc_size;
- min_devices = min_stripes;
- }
-
- INIT_LIST_HEAD(&private_devs);
- while (index < num_stripes) {
- device = list_entry(cur, struct btrfs_device, dev_alloc_list);
- BUG_ON(!device->writeable);
- if (device->total_bytes > device->bytes_used)
- avail = device->total_bytes - device->bytes_used;
- else
- avail = 0;
- cur = cur->next;
-
- if (device->in_fs_metadata && avail >= min_free) {
- ret = find_free_dev_extent(trans, device, min_free,
- &devices_info[i].dev_offset,
- &devices_info[i].max_avail);
- if (ret == 0) {
- list_move_tail(&device->dev_alloc_list,
- &private_devs);
- map->stripes[index].dev = device;
- map->stripes[index].physical =
- devices_info[i].dev_offset;
- index++;
- if (type & BTRFS_BLOCK_GROUP_DUP) {
- map->stripes[index].dev = device;
- map->stripes[index].physical =
- devices_info[i].dev_offset +
- calc_size;
- index++;
- }
- } else if (ret != -ENOSPC)
- goto error;
-
- devices_info[i].dev = device;
- i++;
- } else if (device->in_fs_metadata &&
- avail >= BTRFS_STRIPE_LEN) {
- devices_info[i].dev = device;
- devices_info[i].max_avail = avail;
- i++;
- }
-
- if (cur == &fs_devices->alloc_list)
- break;
- }
-
- list_splice(&private_devs, &fs_devices->alloc_list);
- if (index < num_stripes) {
- if (index >= min_stripes) {
- num_stripes = index;
- if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes /= sub_stripes;
- num_stripes *= sub_stripes;
- }
-
- map = __shrink_map_lookup_stripes(map, num_stripes);
- } else if (i >= min_devices) {
- ret = __btrfs_alloc_tiny_space(trans, fs_devices,
- devices_info, i, type,
- &map, min_stripes,
- &calc_size);
- if (ret)
- goto error;
- } else {
- ret = -ENOSPC;
- goto error;
+ for (i = 0; i < ndevs; ++i) {
+ for (j = 0; j < dev_stripes; ++j) {
+ int s = i * dev_stripes + j;
+ map->stripes[s].dev = devices_info[i].dev;
+ map->stripes[s].physical = devices_info[i].dev_offset +
+ j * stripe_size;
}
}
map->sector_size = extent_root->sectorsize;
@@ -2637,18 +2491,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
map->sub_stripes = sub_stripes;
*map_ret = map;
- *stripe_size = calc_size;
- *num_bytes = chunk_bytes_by_type(type, calc_size,
- map->num_stripes, sub_stripes);
+ num_bytes = stripe_size * (num_stripes / ncopies);
+
+ *stripe_size_out = stripe_size;
+ *num_bytes_out = num_bytes;
+
+ trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto error;
}
em->bdev = (struct block_device *)map;
em->start = start;
- em->len = *num_bytes;
+ em->len = num_bytes;
em->block_start = 0;
em->block_len = em->len;
@@ -2661,20 +2518,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_make_block_group(trans, extent_root, 0, type,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, *num_bytes);
+ start, num_bytes);
BUG_ON(ret);
- index = 0;
- while (index < map->num_stripes) {
- device = map->stripes[index].dev;
- dev_offset = map->stripes[index].physical;
+ for (i = 0; i < map->num_stripes; ++i) {
+ struct btrfs_device *device;
+ u64 dev_offset;
+
+ device = map->stripes[i].dev;
+ dev_offset = map->stripes[i].physical;
ret = btrfs_alloc_dev_extent(trans, device,
info->chunk_root->root_key.objectid,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- start, dev_offset, calc_size);
+ start, dev_offset, stripe_size);
BUG_ON(ret);
- index++;
}
kfree(devices_info);
@@ -2749,6 +2607,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
item_size);
BUG_ON(ret);
}
+
kfree(chunk);
return 0;
}
@@ -2880,7 +2739,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
{
- extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+ extent_map_tree_init(&tree->map_tree);
}
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -2946,14 +2805,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
struct btrfs_multi_bio **multi_ret,
- int mirror_num, struct page *unplug_page)
+ int mirror_num)
{
struct extent_map *em;
struct map_lookup *map;
struct extent_map_tree *em_tree = &map_tree->map_tree;
u64 offset;
u64 stripe_offset;
+ u64 stripe_end_offset;
u64 stripe_nr;
+ u64 stripe_nr_orig;
+ u64 stripe_nr_end;
int stripes_allocated = 8;
int stripes_required = 1;
int stripe_index;
@@ -2962,7 +2824,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
int max_errors = 0;
struct btrfs_multi_bio *multi = NULL;
- if (multi_ret && !(rw & REQ_WRITE))
+ if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
stripes_allocated = 1;
again:
if (multi_ret) {
@@ -2978,11 +2840,6 @@ again:
em = lookup_extent_mapping(em_tree, logical, *length);
read_unlock(&em_tree->lock);
- if (!em && unplug_page) {
- kfree(multi);
- return 0;
- }
-
if (!em) {
printk(KERN_CRIT "unable to find logical %llu len %llu\n",
(unsigned long long)logical,
@@ -3008,7 +2865,15 @@ again:
max_errors = 1;
}
}
- if (multi_ret && (rw & REQ_WRITE) &&
+ if (rw & REQ_DISCARD) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID10)) {
+ stripes_required = map->num_stripes;
+ }
+ }
+ if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes;
free_extent_map(em);
@@ -3028,23 +2893,37 @@ again:
/* stripe_offset is the offset of this block in its stripe*/
stripe_offset = offset - stripe_offset;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ if (rw & REQ_DISCARD)
+ *length = min_t(u64, em->len - offset, *length);
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_DUP)) {
/* we limit the length of each bio to what fits in a stripe */
*length = min_t(u64, em->len - offset,
- map->stripe_len - stripe_offset);
+ map->stripe_len - stripe_offset);
} else {
*length = em->len - offset;
}
- if (!multi_ret && !unplug_page)
+ if (!multi_ret)
goto out;
num_stripes = 1;
stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
- if (unplug_page || (rw & REQ_WRITE))
+ stripe_nr_orig = stripe_nr;
+ stripe_nr_end = (offset + *length + map->stripe_len - 1) &
+ (~(map->stripe_len - 1));
+ do_div(stripe_nr_end, map->stripe_len);
+ stripe_end_offset = stripe_nr_end * map->stripe_len -
+ (offset + *length);
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->num_stripes,
+ stripe_nr_end - stripe_nr_orig);
+ stripe_index = do_div(stripe_nr, map->num_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
+ if (rw & (REQ_WRITE | REQ_DISCARD))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -3055,7 +2934,7 @@ again:
}
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (rw & REQ_WRITE)
+ if (rw & (REQ_WRITE | REQ_DISCARD))
num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
@@ -3066,8 +2945,12 @@ again:
stripe_index = do_div(stripe_nr, factor);
stripe_index *= map->sub_stripes;
- if (unplug_page || (rw & REQ_WRITE))
+ if (rw & REQ_WRITE)
num_stripes = map->sub_stripes;
+ else if (rw & REQ_DISCARD)
+ num_stripes = min_t(u64, map->sub_stripes *
+ (stripe_nr_end - stripe_nr_orig),
+ map->num_stripes);
else if (mirror_num)
stripe_index += mirror_num - 1;
else {
@@ -3085,24 +2968,101 @@ again:
}
BUG_ON(stripe_index >= map->num_stripes);
- for (i = 0; i < num_stripes; i++) {
- if (unplug_page) {
- struct btrfs_device *device;
- struct backing_dev_info *bdi;
-
- device = map->stripes[stripe_index].dev;
- if (device->bdev) {
- bdi = blk_get_backing_dev_info(device->bdev);
- if (bdi->unplug_io_fn)
- bdi->unplug_io_fn(bdi, unplug_page);
- }
- } else {
+ if (rw & REQ_DISCARD) {
+ for (i = 0; i < num_stripes; i++) {
multi->stripes[i].physical =
map->stripes[stripe_index].physical +
stripe_offset + stripe_nr * map->stripe_len;
multi->stripes[i].dev = map->stripes[stripe_index].dev;
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ u64 stripes;
+ u32 last_stripe = 0;
+ int j;
+
+ div_u64_rem(stripe_nr_end - 1,
+ map->num_stripes,
+ &last_stripe);
+
+ for (j = 0; j < map->num_stripes; j++) {
+ u32 test;
+
+ div_u64_rem(stripe_nr_end - 1 - j,
+ map->num_stripes, &test);
+ if (test == stripe_index)
+ break;
+ }
+ stripes = stripe_nr_end - 1 - j;
+ do_div(stripes, map->num_stripes);
+ multi->stripes[i].length = map->stripe_len *
+ (stripes - stripe_nr + 1);
+
+ if (i == 0) {
+ multi->stripes[i].length -=
+ stripe_offset;
+ stripe_offset = 0;
+ }
+ if (stripe_index == last_stripe)
+ multi->stripes[i].length -=
+ stripe_end_offset;
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ u64 stripes;
+ int j;
+ int factor = map->num_stripes /
+ map->sub_stripes;
+ u32 last_stripe = 0;
+
+ div_u64_rem(stripe_nr_end - 1,
+ factor, &last_stripe);
+ last_stripe *= map->sub_stripes;
+
+ for (j = 0; j < factor; j++) {
+ u32 test;
+
+ div_u64_rem(stripe_nr_end - 1 - j,
+ factor, &test);
+
+ if (test ==
+ stripe_index / map->sub_stripes)
+ break;
+ }
+ stripes = stripe_nr_end - 1 - j;
+ do_div(stripes, factor);
+ multi->stripes[i].length = map->stripe_len *
+ (stripes - stripe_nr + 1);
+
+ if (i < map->sub_stripes) {
+ multi->stripes[i].length -=
+ stripe_offset;
+ if (i == map->sub_stripes - 1)
+ stripe_offset = 0;
+ }
+ if (stripe_index >= last_stripe &&
+ stripe_index <= (last_stripe +
+ map->sub_stripes - 1)) {
+ multi->stripes[i].length -=
+ stripe_end_offset;
+ }
+ } else
+ multi->stripes[i].length = *length;
+
+ stripe_index++;
+ if (stripe_index == map->num_stripes) {
+ /* This could only happen for RAID0/10 */
+ stripe_index = 0;
+ stripe_nr++;
+ }
+ }
+ } else {
+ for (i = 0; i < num_stripes; i++) {
+ multi->stripes[i].physical =
+ map->stripes[stripe_index].physical +
+ stripe_offset +
+ stripe_nr * map->stripe_len;
+ multi->stripes[i].dev =
+ map->stripes[stripe_index].dev;
+ stripe_index++;
}
- stripe_index++;
}
if (multi_ret) {
*multi_ret = multi;
@@ -3119,7 +3079,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
struct btrfs_multi_bio **multi_ret, int mirror_num)
{
return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
- mirror_num, NULL);
+ mirror_num);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3187,14 +3147,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
- u64 logical, struct page *page)
-{
- u64 length = PAGE_CACHE_SIZE;
- return __btrfs_map_block(map_tree, READ, logical, &length,
- NULL, 0, page);
-}
-
static void end_bio_multi_stripe(struct bio *bio, int err)
{
struct btrfs_multi_bio *multi = bio->bi_private;
@@ -3437,7 +3389,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
free_extent_map(em);
}
- em = alloc_extent_map(GFP_NOFS);
+ em = alloc_extent_map();
if (!em)
return -ENOMEM;
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3626,15 +3578,6 @@ static int read_one_dev(struct btrfs_root *root,
return ret;
}
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
-{
- struct btrfs_dev_item *dev_item;
-
- dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
- dev_item);
- return read_one_dev(root, buf, dev_item);
-}
-
int btrfs_read_sys_array(struct btrfs_root *root)
{
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -3751,7 +3694,7 @@ again:
}
if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
key.objectid = 0;
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
goto again;
}
ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7fb59d45fe8c..7c12d61ae7ae 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -85,7 +85,12 @@ struct btrfs_device {
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /* per-device scrub information */
+ struct scrub_dev *scrub_device;
+
struct btrfs_work work;
+ struct rcu_head rcu;
+ struct work_struct rcu_work;
};
struct btrfs_fs_devices {
@@ -126,6 +131,7 @@ struct btrfs_fs_devices {
struct btrfs_bio_stripe {
struct btrfs_device *dev;
u64 physical;
+ u64 length; /* only used for discard mappings */
};
struct btrfs_multi_bio {
@@ -143,22 +149,22 @@ struct btrfs_device_info {
struct btrfs_device *dev;
u64 dev_offset;
u64 max_avail;
+ u64 total_avail;
};
-/* Used to sort the devices by max_avail(descending sort) */
-int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
+struct map_lookup {
+ u64 type;
+ int io_align;
+ int io_width;
+ int stripe_len;
+ int sector_size;
+ int num_stripes;
+ int sub_stripes;
+ struct btrfs_bio_stripe stripes[];
+};
-/*
- * sort the devices by max_avail, in which max free extent size of each device
- * is stored.(Descending Sort)
- */
-static inline void btrfs_descending_sort_devices(
- struct btrfs_device_info *devices,
- size_t nr_devices)
-{
- sort(devices, nr_devices, sizeof(struct btrfs_device_info),
- btrfs_cmp_device_free_bytes, NULL);
-}
+#define map_lookup_size(n) (sizeof(struct map_lookup) + \
+ (sizeof(struct btrfs_bio_stripe) * (n)))
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
@@ -184,7 +190,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int mirror_num, int async_submit);
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
@@ -197,8 +202,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
int btrfs_rm_device(struct btrfs_root *root, char *device_path);
int btrfs_cleanup_fs_uuids(void);
int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
-int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
- u64 logical, struct page *page);
int btrfs_grow_device(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 new_size);
struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
@@ -206,8 +209,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
int btrfs_init_new_device(struct btrfs_root *root, char *path);
int btrfs_balance(struct btrfs_root *dev_root);
-void btrfs_unlock_volumes(void);
-void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..f3107e4b4d56 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -44,7 +44,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
return -ENOMEM;
/* lookup the xattr by name */
- di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
strlen(name), 0);
if (!di) {
ret = -ENODATA;
@@ -103,7 +103,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
return -ENOMEM;
/* first lets see if we already have this xattr */
- di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
strlen(name), -1);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -120,13 +120,13 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
ret = btrfs_delete_one_dir_name(trans, root, path, di);
BUG_ON(ret);
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
/* if we don't have a value then we are removing the xattr */
if (!value)
goto out;
} else {
- btrfs_release_path(root, path);
+ btrfs_release_path(path);
if (flags & XATTR_REPLACE) {
/* we couldn't find the attr to replace */
@@ -136,7 +136,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
}
/* ok we have to create a completely new xattr */
- ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
name, name_len, value, size);
BUG_ON(ret);
out:
@@ -180,18 +180,17 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
- int ret = 0, slot, advance;
+ int ret = 0, slot;
size_t total_size = 0, size_left = size;
unsigned long name_ptr;
size_t name_len;
- u32 nritems;
/*
* ok we want all objects associated with this id.
* NOTE: we set key.offset = 0; because we want to start with the
* first xattr that we find and walk forward
*/
- key.objectid = inode->i_ino;
+ key.objectid = btrfs_ino(inode);
btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
key.offset = 0;
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
- advance = 0;
+
while (1) {
leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
slot = path->slots[0];
/* this is where we start walking through the path */
- if (advance || slot >= nritems) {
+ if (slot >= btrfs_header_nritems(leaf)) {
/*
* if we've reached the last slot in this leaf we need
* to go to the next leaf and reset everything
*/
- if (slot >= nritems-1) {
- ret = btrfs_next_leaf(root, path);
- if (ret)
- break;
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- slot = path->slots[0];
- } else {
- /*
- * just walking through the slots on this leaf
- */
- slot++;
- path->slots[0]++;
- }
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto err;
+ else if (ret > 0)
+ break;
+ continue;
}
- advance = 1;
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
break;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, di))
+ continue;
name_len = btrfs_dir_name_len(leaf, di);
total_size += name_len + 1;
/* we are just looking for how big our buffer needs to be */
if (!size)
- continue;
+ goto next;
if (!buffer || (name_len + 1) > size_left) {
ret = -ERANGE;
@@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
size_left -= name_len + 1;
buffer += name_len + 1;
+next:
+ path->slots[0]++;
}
ret = total_size;
@@ -370,7 +363,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
}
int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr)
{
int err;
size_t len;
@@ -378,7 +372,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
char *suffix;
char *name;
- err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+ err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
+ &len);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
extern int btrfs_removexattr(struct dentry *dentry, const char *name);
extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir);
+ struct inode *inode, struct inode *dir,
+ const struct qstr *qstr);
#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
+ workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
+ MAX_WBITS, MAX_MEM_LEVEL));
workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
if (!workspace->def_strm.workspace ||