summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/affs/namei.c2
-rw-r--r--fs/aio.c65
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/dev-ioctl.c13
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c109
-rw-r--r--fs/btrfs/ctree.h163
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c169
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2255
-rw-r--r--fs/btrfs/extent_io.c85
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c166
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1713
-rw-r--r--fs/btrfs/ioctl.c206
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1971
-rw-r--r--fs/btrfs/root-tree.c23
-rw-r--r--fs/btrfs/super.c30
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c17
-rw-r--r--fs/btrfs/xattr.c12
-rw-r--r--fs/compat.c132
-rw-r--r--fs/direct-io.c62
-rw-r--r--fs/exec.c195
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c26
-rw-r--r--fs/ext4/ext4.h167
-rw-r--r--fs/ext4/ext4_jbd2.h8
-rw-r--r--fs/ext4/extents.c417
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsync.c35
-rw-r--r--fs/ext4/ialloc.c89
-rw-r--r--fs/ext4/inode.c723
-rw-r--r--fs/ext4/ioctl.c27
-rw-r--r--fs/ext4/mballoc.c120
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c13
-rw-r--r--fs/ext4/namei.c61
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c80
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/xattr.c39
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/ncpfs/dir.c1
-rw-r--r--fs/nfs/dir.c1
-rw-r--r--fs/nfs/write.c20
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c16
-rw-r--r--fs/proc/generic.c15
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/qnx4/dir.c1
-rw-r--r--fs/quota/dquot.c11
-rw-r--r--fs/read_write.c17
-rw-r--r--fs/reiserfs/dir.c1
-rw-r--r--fs/smbfs/dir.c1
-rw-r--r--fs/squashfs/Kconfig11
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/inode.c92
-rw-r--r--fs/squashfs/namei.c6
-rw-r--r--fs/squashfs/squashfs.h12
-rw-r--r--fs/squashfs/squashfs_fs.h76
-rw-r--r--fs/squashfs/squashfs_fs_i.h3
-rw-r--r--fs/squashfs/squashfs_fs_sb.h3
-rw-r--r--fs/squashfs/super.c30
-rw-r--r--fs/squashfs/symlink.c11
-rw-r--r--fs/squashfs/xattr.c323
-rw-r--r--fs/squashfs/xattr.h46
-rw-r--r--fs/squashfs/xattr_id.c100
-rw-r--r--fs/udf/dir.c1
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/ufs/ufs_fs.h1
87 files changed, 7071 insertions, 3739 deletions
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
affs_brelse(bh);
inode = affs_iget(sb, ino);
if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
+ return ERR_CAST(inode);
}
dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
d_add(dentry, inode);
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..48fdeebdb544 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
#include <linux/blkdev.h>
#include <linux/mempool.h>
#include <linux/hash.h>
+#include <linux/compat.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
return ret;
}
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
{
ssize_t ret;
- ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
- kiocb->ki_nbytes, 1,
- &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+ if (compat)
+ ret = compat_rw_copy_check_uvector(type,
+ (struct compat_iovec __user *)kiocb->ki_buf,
+ kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ &kiocb->ki_iovec);
+ else
+#endif
+ ret = rw_copy_check_uvector(type,
+ (struct iovec __user *)kiocb->ki_buf,
+ kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+ &kiocb->ki_iovec);
if (ret < 0)
goto out;
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
* Performs the initial checks and aio retry method
* setup for the kiocb at the time of io submission.
*/
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
{
struct file *file = kiocb->ki_filp;
ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
- ret = aio_setup_vectored_rw(READ, kiocb);
+ ret = aio_setup_vectored_rw(READ, kiocb, compat);
if (ret)
break;
ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
- ret = aio_setup_vectored_rw(WRITE, kiocb);
+ ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
if (ret)
break;
ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
}
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
- struct iocb *iocb, struct hlist_head *batch_hash)
+ struct iocb *iocb, struct hlist_head *batch_hash,
+ bool compat)
{
struct kiocb *req;
struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
req->ki_opcode = iocb->aio_lio_opcode;
- ret = aio_setup_iocb(req);
+ ret = aio_setup_iocb(req, compat);
if (ret)
goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
return ret;
}
-/* sys_io_submit:
- * Queue the nr iocbs pointed to by iocbpp for processing. Returns
- * the number of iocbs queued. May return -EINVAL if the aio_context
- * specified by ctx_id is invalid, if nr is < 0, if the iocb at
- * *iocbpp[0] is not properly initialized, if the operation specified
- * is invalid for the file descriptor in the iocb. May fail with
- * -EFAULT if any of the data structures point to invalid data. May
- * fail with -EBADF if the file descriptor specified in the first
- * iocb is invalid. May fail with -EAGAIN if insufficient resources
- * are available to queue any iocbs. Will return 0 if nr is 0. Will
- * fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
- struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+ struct iocb __user *__user *iocbpp, bool compat)
{
struct kioctx *ctx;
long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
break;
}
- ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+ ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
if (ret)
break;
}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
return i ? i : ret;
}
+/* sys_io_submit:
+ * Queue the nr iocbs pointed to by iocbpp for processing. Returns
+ * the number of iocbs queued. May return -EINVAL if the aio_context
+ * specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ * *iocbpp[0] is not properly initialized, if the operation specified
+ * is invalid for the file descriptor in the iocb. May fail with
+ * -EFAULT if any of the data structures point to invalid data. May
+ * fail with -EBADF if the file descriptor specified in the first
+ * iocb is invalid. May fail with -EAGAIN if insufficient resources
+ * are available to queue any iocbs. Will return 0 if nr is 0. Will
+ * fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+ struct iocb __user * __user *, iocbpp)
+{
+ return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
/* lookup_kiocb
* Finds a given iocb for cancellation.
*/
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
const struct file_operations autofs_root_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = autofs_root_readdir,
.ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d832062869f6..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
*/
static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
{
- struct autofs_dev_ioctl tmp, *ads;
+ struct autofs_dev_ioctl tmp;
if (copy_from_user(&tmp, in, sizeof(tmp)))
return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
- ads = kmalloc(tmp.size, GFP_KERNEL);
- if (!ads)
- return ERR_PTR(-ENOMEM);
-
- if (copy_from_user(ads, in, tmp.size)) {
- kfree(ads);
- return ERR_PTR(-EFAULT);
- }
-
- return ads;
+ return memdup_user(in, tmp.size);
}
static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending)) {
spin_unlock_irq(&worker->lock);
+ set_current_state(TASK_RUNNING);
goto again;
}
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
* of extent items we've reserved metadata for.
*/
spinlock_t accounting_lock;
+ atomic_t outstanding_extents;
int reserved_extents;
- int outstanding_extents;
/*
* ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned orphan_meta_reserved:1;
unsigned dummy_inode:1;
/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- struct extent_buffer *cow)
+ struct extent_buffer *cow,
+ int *last_ref)
{
u64 refs;
u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
clean_tree_block(trans, root, buf);
+ *last_ref = 1;
}
return 0;
}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *cow;
int level;
+ int last_ref = 0;
int unlock_orig = 0;
u64 parent_start;
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
(unsigned long)btrfs_header_fsid(cow),
BTRFS_FSID_SIZE);
- update_ref_for_cow(trans, root, buf, cow);
+ update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+ if (root->ref_cows)
+ btrfs_reloc_cow_block(trans, root, buf, cow);
if (buf == root->node) {
WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
extent_buffer_get(cow);
spin_unlock(&root->node_lock);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
free_extent_buffer(buf);
add_root_to_dirty_list(root);
} else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- btrfs_free_tree_block(trans, root, buf->start, buf->len,
- parent_start, root->root_key.objectid, level);
+ btrfs_free_tree_block(trans, root, buf, parent_start,
+ last_ref);
}
if (unlock_orig)
btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
return bin_search(eb, key, level, slot);
}
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) + size);
+ spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+ spin_lock(&root->accounting_lock);
+ btrfs_set_root_used(&root->root_item,
+ btrfs_root_used(&root->root_item) - size);
+ spin_unlock(&root->accounting_lock);
+}
+
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
* NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(child);
+ free_extent_buffer(child);
+ goto enospc;
+ }
spin_lock(&root->node_lock);
root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(mid);
/* once for the path */
free_extent_buffer(mid);
- ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
- 0, root->root_key.objectid, level);
+
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
/* once for the root ptr */
free_extent_buffer(mid);
- return ret;
+ return 0;
}
if (btrfs_header_nritems(mid) >
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (btrfs_header_nritems(right) == 0) {
- u64 bytenr = right->start;
- u32 blocksize = right->len;
-
clean_tree_block(trans, root, right);
btrfs_tree_unlock(right);
- free_extent_buffer(right);
- right = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot +
1);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root,
- bytenr, blocksize, 0,
- root->root_key.objectid,
- level);
- if (wret)
- ret = wret;
+ root_sub_used(root, right->len);
+ btrfs_free_tree_block(trans, root, right, 0, 1);
+ free_extent_buffer(right);
+ right = NULL;
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(wret == 1);
}
if (btrfs_header_nritems(mid) == 0) {
- /* we've managed to empty the middle node, drop it */
- u64 bytenr = mid->start;
- u32 blocksize = mid->len;
-
clean_tree_block(trans, root, mid);
btrfs_tree_unlock(mid);
- free_extent_buffer(mid);
- mid = NULL;
wret = del_ptr(trans, root, path, level + 1, pslot);
if (wret)
ret = wret;
- wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
- 0, root->root_key.objectid, level);
- if (wret)
- ret = wret;
+ root_sub_used(root, mid->len);
+ btrfs_free_tree_block(trans, root, mid, 0, 1);
+ free_extent_buffer(mid);
+ mid = NULL;
} else {
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
btrfs_release_path(NULL, p);
ret = -EAGAIN;
- tmp = read_tree_block(root, blocknr, blocksize, gen);
+ tmp = read_tree_block(root, blocknr, blocksize, 0);
if (tmp) {
/*
* If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
p->nodes[level + 1],
p->slots[level + 1], &b);
if (err) {
- free_extent_buffer(b);
ret = err;
goto done;
}
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (IS_ERR(c))
return PTR_ERR(c);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_nritems(c, 1);
btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
int nritems;
BUG_ON(!path->nodes[level]);
+ btrfs_assert_tree_locked(path->nodes[level]);
lower = path->nodes[level];
nritems = btrfs_header_nritems(lower);
BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
if (IS_ERR(split))
return PTR_ERR(split);
+ root_add_used(root, root->nodesize);
+
memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_level(split, btrfs_header_level(c));
btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
if (left_nritems)
btrfs_mark_buffer_dirty(left);
+ else
+ clean_tree_block(trans, root, left);
+
btrfs_mark_buffer_dirty(right);
btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(left);
if (right_nritems)
btrfs_mark_buffer_dirty(right);
+ else
+ clean_tree_block(trans, root, right);
btrfs_item_key(right, &disk_key, 0);
wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
/* then fixup the leaf pointer in the path */
if (path->slots[0] < push_items) {
path->slots[0] += old_left_nritems;
- if (btrfs_header_nritems(path->nodes[0]) == 0)
- clean_tree_block(trans, root, path->nodes[0]);
btrfs_tree_unlock(path->nodes[0]);
free_extent_buffer(path->nodes[0]);
path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
root->root_key.objectid,
&disk_key, 0, l->start, 0);
- if (IS_ERR(right)) {
- BUG_ON(1);
+ if (IS_ERR(right))
return PTR_ERR(right);
- }
+
+ root_add_used(root, root->leafsize);
memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &key, path, ins_len, 1);
- BUG_ON(ret);
+ if (ret)
+ goto err;
path->keep_locks = 0;
btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
*/
btrfs_unlock_up_safe(path, 0);
- ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
- 0, root->root_key.objectid, 0);
- return ret;
+ root_sub_used(root, leaf->len);
+
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ return 0;
}
/*
* delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (leaf == root->node) {
btrfs_set_header_level(leaf, 0);
} else {
+ btrfs_set_path_blocking(path);
+ clean_tree_block(trans, root, leaf);
ret = btrfs_del_leaf(trans, root, path, leaf);
BUG_ON(ret);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..e9bf86415e86 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
struct btrfs_trans_handle;
struct btrfs_transaction;
+struct btrfs_pending_snapshot;
extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
+#define BTRFS_NR_RAID_TYPES 5
struct btrfs_block_group_item {
__le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
u64 flags;
u64 total_bytes; /* total bytes in the space */
- u64 bytes_used; /* total bytes used on disk */
+ u64 bytes_used; /* total bytes used,
+ this does't take mirrors into account */
u64 bytes_pinned; /* total bytes pinned, will be freed when the
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
- u64 bytes_super; /* total bytes reserved for the super blocks */
- u64 bytes_root; /* the number of bytes needed to commit a
- transaction */
+
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
- u64 bytes_delalloc; /* number of bytes currently reserved for
- delayed allocation */
+ u64 disk_used; /* total bytes used on disk */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
- int force_delalloc; /* make people start doing filemap_flush until
- we're under a threshold */
struct list_head list;
- /* for controlling how we free up space for allocations */
- wait_queue_head_t allocate_wait;
- wait_queue_head_t flush_wait;
- int allocating_chunk;
- int flushing;
-
/* for block groups in our same type */
- struct list_head block_groups;
+ struct list_head block_groups[BTRFS_NR_RAID_TYPES];
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
};
+struct btrfs_block_rsv {
+ u64 size;
+ u64 reserved;
+ u64 freed[2];
+ struct btrfs_space_info *space_info;
+ struct list_head list;
+ spinlock_t lock;
+ atomic_t usage;
+ unsigned int priority:8;
+ unsigned int durable:1;
+ unsigned int refill_used:1;
+ unsigned int full:1;
+};
+
/*
* free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 reserved_pinned;
u64 bytes_super;
u64 flags;
u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
+ /* block reservation for extent, checksum and root tree */
+ struct btrfs_block_rsv global_block_rsv;
+ /* block reservation for delay allocation */
+ struct btrfs_block_rsv delalloc_block_rsv;
+ /* block reservation for metadata operations */
+ struct btrfs_block_rsv trans_block_rsv;
+ /* block reservation for chunk tree */
+ struct btrfs_block_rsv chunk_block_rsv;
+
+ struct btrfs_block_rsv empty_block_rsv;
+
+ /* list of block reservations that cross multiple transactions */
+ struct list_head durable_block_rsv_list;
+
+ struct mutex durable_block_rsv_mutex;
+
u64 generation;
u64 last_trans_committed;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
struct btrfs_workers endio_meta_write_workers;
struct btrfs_workers endio_write_workers;
struct btrfs_workers submit_workers;
- struct btrfs_workers enospc_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
int do_barriers;
int closing;
int log_root_recovering;
+ int enospc_unlink;
u64 total_pinned;
@@ -1012,6 +1035,9 @@ struct btrfs_root {
struct completion kobj_unregister;
struct mutex objectid_mutex;
+ spinlock_t accounting_lock;
+ struct btrfs_block_rsv *block_rsv;
+
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
int ref_cows;
int track_dirty;
int in_radix;
- int clean_orphans;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t list_lock;
+ spinlock_t orphan_lock;
struct list_head orphan_list;
+ struct btrfs_block_rsv *orphan_block_rsv;
+ int orphan_item_inserted;
+ int orphan_cleanup_state;
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
struct btrfs_disk_key *key, int level,
u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level);
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref);
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group);
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 logical_offset, u32 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
void btrfs_add_delayed_iput(struct inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_root *root);
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending);
#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
}
/*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags)
-{
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *head;
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct extent_buffer *leaf;
- struct btrfs_key key;
- u32 item_size;
- u64 num_refs;
- u64 extent_flags;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = num_bytes;
- delayed_refs = &trans->transaction->delayed_refs;
-again:
- ret = btrfs_search_slot(trans, root->fs_info->extent_root,
- &key, path, 0, 0);
- if (ret < 0)
- goto out;
-
- if (ret == 0) {
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- ei = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item);
- num_refs = btrfs_extent_refs(leaf, ei);
- extent_flags = btrfs_extent_flags(leaf, ei);
- } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- struct btrfs_extent_item_v0 *ei0;
- BUG_ON(item_size != sizeof(*ei0));
- ei0 = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_extent_item_v0);
- num_refs = btrfs_extent_refs_v0(leaf, ei0);
- /* FIXME: this isn't correct for data */
- extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
- BUG();
-#endif
- }
- BUG_ON(num_refs == 0);
- } else {
- num_refs = 0;
- extent_flags = 0;
- ret = 0;
- }
-
- spin_lock(&delayed_refs->lock);
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
- if (ref) {
- head = btrfs_delayed_node_to_head(ref);
- if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&ref->refs);
- spin_unlock(&delayed_refs->lock);
-
- btrfs_release_path(root->fs_info->extent_root, path);
-
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
- goto again;
- }
- if (head->extent_op && head->extent_op->update_flags)
- extent_flags |= head->extent_op->flags_to_set;
- else
- BUG_ON(num_refs == 0);
-
- num_refs += ref->ref_mod;
- mutex_unlock(&head->mutex);
- }
- WARN_ON(num_refs == 0);
- if (refs)
- *refs = num_refs;
- if (flags)
- *flags = extent_flags;
-out:
- spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
- return ret;
-}
-
-/*
* helper function to update an extent delayed ref in the
* rbtree. existing and update must both have the same
* bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u64 *refs, u64 *flags);
int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 orig_parent,
u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
int rw;
int mirror_num;
unsigned long bio_flags;
+ /*
+ * bio_offset is optional, can be used if the pages in the bio
+ * can't tell us where in the file the bio should go
+ */
+ u64 bio_offset;
struct btrfs_work work;
};
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
async = container_of(work, struct async_submit_bio, work);
fs_info = BTRFS_I(async->inode)->root->fs_info;
async->submit_bio_start(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
wake_up(&fs_info->async_submit_wait);
async->submit_bio_done(async->inode, async->rw, async->bio,
- async->mirror_num, async->bio_flags);
+ async->mirror_num, async->bio_flags,
+ async->bio_offset);
}
static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
unsigned long bio_flags,
+ u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done)
{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->work.flags = 0;
async->bio_flags = bio_flags;
+ async->bio_offset = bio_offset;
atomic_inc(&fs_info->nr_async_submits);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
static int __btree_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
}
static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
/*
* when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
}
static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
int ret;
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
*/
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num, 0,
+ bio_offset,
__btree_submit_bio_start,
__btree_submit_bio_done);
}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->ref_cows = 0;
root->track_dirty = 0;
root->in_radix = 0;
- root->clean_orphans = 0;
+ root->orphan_item_inserted = 0;
+ root->orphan_cleanup_state = 0;
root->fs_info = fs_info;
root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree = RB_ROOT;
+ root->block_rsv = NULL;
+ root->orphan_block_rsv = NULL;
INIT_LIST_HEAD(&root->dirty_list);
INIT_LIST_HEAD(&root->orphan_list);
INIT_LIST_HEAD(&root->root_list);
spin_lock_init(&root->node_lock);
- spin_lock_init(&root->list_lock);
+ spin_lock_init(&root->orphan_lock);
spin_lock_init(&root->inode_lock);
+ spin_lock_init(&root->accounting_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
return 0;
}
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct extent_buffer *eb;
- struct btrfs_root *log_root_tree = fs_info->log_root_tree;
- u64 start = 0;
- u64 end = 0;
- int ret;
-
- if (!log_root_tree)
- return 0;
-
- while (1) {
- ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
- 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
- if (ret)
- break;
-
- clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
- EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
- }
- eb = fs_info->log_root_tree->node;
-
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) != 0);
-
- ret = btrfs_free_reserved_extent(fs_info->tree_root,
- eb->start, eb->len);
- BUG_ON(ret);
-
- free_extent_buffer(eb);
- kfree(fs_info->log_root_tree);
- fs_info->log_root_tree = NULL;
- return 0;
-}
-
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@@ -1191,19 +1172,23 @@ again:
if (root)
return root;
- ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
- if (ret == 0)
- ret = -ENOENT;
- if (ret < 0)
- return ERR_PTR(ret);
-
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
- WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ ret = -ENOENT;
+ goto fail;
+ }
+
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret < 0)
+ goto fail;
+ if (ret == 0)
+ root->orphan_item_inserted = 1;
+
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto fail;
@@ -1212,10 +1197,9 @@ again:
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
- if (ret == 0) {
+ if (ret == 0)
root->in_radix = 1;
- root->clean_orphans = 1;
- }
+
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
struct btrfs_root *root = arg;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
if (freezing(current)) {
refrigerator();
} else {
- smp_mb();
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ if (!kthread_should_stop())
+ schedule();
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
struct btrfs_root *root = arg;
struct btrfs_trans_handle *trans;
struct btrfs_transaction *cur;
+ u64 transid;
unsigned long now;
unsigned long delay;
int ret;
do {
- smp_mb();
- if (root->fs_info->closing)
- break;
-
delay = HZ * 30;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- mutex_lock(&root->fs_info->trans_mutex);
+ spin_lock(&root->fs_info->new_trans_lock);
cur = root->fs_info->running_transaction;
if (!cur) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ spin_unlock(&root->fs_info->new_trans_lock);
goto sleep;
}
now = get_seconds();
- if (now < cur->start_time || now - cur->start_time < 30) {
- mutex_unlock(&root->fs_info->trans_mutex);
+ if (!cur->blocked &&
+ (now < cur->start_time || now - cur->start_time < 30)) {
+ spin_unlock(&root->fs_info->new_trans_lock);
delay = HZ * 5;
goto sleep;
}
- mutex_unlock(&root->fs_info->trans_mutex);
- trans = btrfs_start_transaction(root, 1);
- ret = btrfs_commit_transaction(trans, root);
+ transid = cur->transid;
+ spin_unlock(&root->fs_info->new_trans_lock);
+ trans = btrfs_join_transaction(root, 1);
+ if (transid == trans->transid) {
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ } else {
+ btrfs_end_transaction(trans, root);
+ }
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
if (freezing(current)) {
refrigerator();
} else {
- if (root->fs_info->closing)
- break;
set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(delay);
+ if (!kthread_should_stop() &&
+ !btrfs_transaction_blocked(root->fs_info))
+ schedule_timeout(delay);
__set_current_state(TASK_RUNNING);
}
} while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info);
btrfs_mapping_init(&fs_info->mapping_tree);
+ btrfs_init_block_rsv(&fs_info->global_block_rsv);
+ btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+ btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+ btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+ btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+ INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+ mutex_init(&fs_info->durable_block_rsv_mutex);
atomic_set(&fs_info->nr_async_submits, 0);
atomic_set(&fs_info->async_delalloc_pages, 0);
atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
min_t(u64, fs_devices->num_devices,
fs_info->thread_pool_size),
&fs_info->generic_worker);
- btrfs_init_workers(&fs_info->enospc_workers, "enospc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
/* a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
- btrfs_start_workers(&fs_info->enospc_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
csum_root->track_dirty = 1;
+ fs_info->generation = generation;
+ fs_info->last_trans_committed = generation;
+ fs_info->data_alloc_profile = (u64)-1;
+ fs_info->metadata_alloc_profile = (u64)-1;
+ fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
printk(KERN_ERR "Failed to read block groups: %d\n", ret);
goto fail_block_groups;
}
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
- fs_info->data_alloc_profile = (u64)-1;
- fs_info->metadata_alloc_profile = (u64)-1;
- fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
@@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BUG_ON(ret);
if (!(sb->s_flags & MS_RDONLY)) {
+ ret = btrfs_cleanup_fs_roots(fs_info);
+ BUG_ON(ret);
+
ret = btrfs_recover_relocation(tree_root);
if (ret < 0) {
printk(KERN_WARNING
@@ -2040,7 +2029,6 @@ fail_sb_buffer:
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
fail_iput:
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
iput(fs_info->btree_inode);
@@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
down_write(&root->fs_info->cleanup_work_sem);
up_write(&root->fs_info->cleanup_work_sem);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
/* run commit again to drop the original snapshot */
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_join_transaction(root, 1);
btrfs_commit_transaction(trans, root);
ret = btrfs_write_and_wait_transaction(NULL, root);
BUG_ON(ret);
@@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
- kthread_stop(root->fs_info->transaction_kthread);
- kthread_stop(root->fs_info->cleaner_kthread);
-
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ kthread_stop(root->fs_info->transaction_kthread);
+ kthread_stop(root->fs_info->cleaner_kthread);
+
fs_info->closing = 2;
smp_mb();
@@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
btrfs_stop_workers(&fs_info->endio_meta_write_workers);
btrfs_stop_workers(&fs_info->endio_write_workers);
btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->enospc_workers);
btrfs_close_devices(fs_info->fs_devices);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
int metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
int rw, struct bio *bio, int mirror_num,
- unsigned long bio_flags,
+ unsigned long bio_flags, u64 bio_offset,
extent_submit_bio_hook_t *submit_bio_start,
extent_submit_bio_hook_t *submit_bio_done);
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
int btrfs_write_tree_block(struct extent_buffer *buf);
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
- if (atomic_dec_and_test(&cache->count))
+ if (atomic_dec_and_test(&cache->count)) {
+ WARN_ON(cache->pinned > 0);
+ WARN_ON(cache->reserved > 0);
+ WARN_ON(cache->reserved_pinned > 0);
kfree(cache);
+ }
}
/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
exclude_super_stripes(extent_root, block_group);
spin_lock(&block_group->space_info->lock);
- block_group->space_info->bytes_super += block_group->bytes_super;
+ block_group->space_info->bytes_readonly += block_group->bytes_super;
spin_unlock(&block_group->space_info->lock);
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
+ flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA;
+
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
}
/*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 *refs, u64 *flags)
+{
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct btrfs_extent_item *ei;
+ struct extent_buffer *leaf;
+ struct btrfs_key key;
+ u32 item_size;
+ u64 num_refs;
+ u64 extent_flags;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ if (!trans) {
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+ }
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out_free;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+ if (item_size >= sizeof(*ei)) {
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ extent_flags = btrfs_extent_flags(leaf, ei);
+ } else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+ struct btrfs_extent_item_v0 *ei0;
+ BUG_ON(item_size != sizeof(*ei0));
+ ei0 = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item_v0);
+ num_refs = btrfs_extent_refs_v0(leaf, ei0);
+ /* FIXME: this isn't correct for data */
+ extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+ BUG();
+#endif
+ }
+ BUG_ON(num_refs == 0);
+ } else {
+ num_refs = 0;
+ extent_flags = 0;
+ ret = 0;
+ }
+
+ if (!trans)
+ goto out;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (head) {
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ goto again;
+ }
+ if (head->extent_op && head->extent_op->update_flags)
+ extent_flags |= head->extent_op->flags_to_set;
+ else
+ BUG_ON(num_refs == 0);
+
+ num_refs += head->node.ref_mod;
+ mutex_unlock(&head->mutex);
+ }
+ spin_unlock(&delayed_refs->lock);
+out:
+ WARN_ON(num_refs == 0);
+ if (refs)
+ *refs = num_refs;
+ if (flags)
+ *flags = extent_flags;
+out_free:
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
* Back reference rules. Back refs have three main goals:
*
* 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
return ret;
}
-
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
- ret = pin_down_bytes(trans, root, NULL,
- node->bytenr, node->num_bytes,
- head->is_data, 1, &must_clean);
- if (ret > 0)
- mark_free = 1;
-
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- if (mark_free) {
- ret = btrfs_free_reserved_extent(root,
- node->bytenr,
- node->num_bytes);
- BUG_ON(ret);
- }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
ret = 0;
out:
btrfs_free_path(path);
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ WARN_ON(ret > 0);
return ret;
}
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
+ int i;
+ int factor;
+
+ if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
found = __find_space_info(info, flags);
if (found) {
spin_lock(&found->lock);
found->total_bytes += total_bytes;
found->bytes_used += bytes_used;
+ found->disk_used += bytes_used * factor;
found->full = 0;
spin_unlock(&found->lock);
*space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
if (!found)
return -ENOMEM;
- INIT_LIST_HEAD(&found->block_groups);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ INIT_LIST_HEAD(&found->block_groups[i]);
init_rwsem(&found->groups_sem);
- init_waitqueue_head(&found->flush_wait);
- init_waitqueue_head(&found->allocate_wait);
spin_lock_init(&found->lock);
- found->flags = flags;
+ found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+ BTRFS_BLOCK_GROUP_SYSTEM |
+ BTRFS_BLOCK_GROUP_METADATA);
found->total_bytes = total_bytes;
found->bytes_used = bytes_used;
+ found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
found->bytes_readonly = 0;
- found->bytes_delalloc = 0;
+ found->bytes_may_use = 0;
found->full = 0;
found->force_alloc = 0;
*space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
}
}
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (!cache->ro) {
- cache->space_info->bytes_readonly += cache->key.offset -
- btrfs_block_group_used(&cache->item);
- cache->ro = 1;
- }
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-}
-
u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return flags;
}
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
-{
- struct btrfs_fs_info *info = root->fs_info;
- u64 alloc_profile;
-
- if (data) {
- alloc_profile = info->avail_data_alloc_bits &
- info->data_alloc_profile;
- data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
- } else if (root == root->fs_info->chunk_root) {
- alloc_profile = info->avail_system_alloc_bits &
- info->system_alloc_profile;
- data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
- } else {
- alloc_profile = info->avail_metadata_alloc_bits &
- info->metadata_alloc_profile;
- data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
- }
-
- return btrfs_reduce_alloc_profile(root, data);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
- u64 alloc_target;
-
- alloc_target = btrfs_get_alloc_profile(root, 1);
- BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
- alloc_target);
-}
-
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
- u64 num_bytes;
- int level;
-
- level = BTRFS_MAX_LEVEL - 2;
- /*
- * NOTE: these calculations are absolutely the worst possible case.
- * This assumes that _every_ item we insert will require a new leaf, and
- * that the tree has grown to its maximum level size.
- */
-
- /*
- * for every item we insert we could insert both an extent item and a
- * extent ref item. Then for ever item we insert, we will need to cow
- * both the original leaf, plus the leaf to the left and right of it.
- *
- * Unless we are talking about the extent root, then we just want the
- * number of items * 2, since we just need the extent item plus its ref.
- */
- if (root == root->fs_info->extent_root)
- num_bytes = num_items * 2;
- else
- num_bytes = (num_items + (2 * num_items)) * 3;
-
- /*
- * num_bytes is total number of leaves we could need times the leaf
- * size, and then for every leaf we could end up cow'ing 2 nodes per
- * level, down to the leaf level.
- */
- num_bytes = (num_bytes * root->leafsize) +
- (num_bytes * (level * 2)) * root->nodesize;
-
- return num_bytes;
-}
-
-/*
- * Unreserve metadata space for delalloc. If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-
- spin_lock(&meta_sinfo->lock);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- if (BTRFS_I(inode)->reserved_extents <=
- BTRFS_I(inode)->outstanding_extents) {
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- spin_unlock(&meta_sinfo->lock);
- return 0;
- }
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
- BTRFS_I(inode)->reserved_extents -= num_items;
- BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-
- if (meta_sinfo->bytes_delalloc < num_bytes) {
- bug = true;
- meta_sinfo->bytes_delalloc = 0;
- } else {
- meta_sinfo->bytes_delalloc -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
{
- u64 thresh;
-
- thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use;
-
- thresh = meta_sinfo->total_bytes - thresh;
- thresh *= 80;
- do_div(thresh, 100);
- if (thresh <= meta_sinfo->bytes_delalloc)
- meta_sinfo->force_delalloc = 1;
- else
- meta_sinfo->force_delalloc = 0;
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ flags |= root->fs_info->avail_data_alloc_bits &
+ root->fs_info->data_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ flags |= root->fs_info->avail_system_alloc_bits &
+ root->fs_info->system_alloc_profile;
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ flags |= root->fs_info->avail_metadata_alloc_bits &
+ root->fs_info->metadata_alloc_profile;
+ return btrfs_reduce_alloc_profile(root, flags);
}
-struct async_flush {
- struct btrfs_root *root;
- struct btrfs_space_info *info;
- struct btrfs_work work;
-};
-
-static noinline void flush_delalloc_async(struct btrfs_work *work)
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
{
- struct async_flush *async;
- struct btrfs_root *root;
- struct btrfs_space_info *info;
-
- async = container_of(work, struct async_flush, work);
- root = async->root;
- info = async->info;
-
- btrfs_start_delalloc_inodes(root, 0);
- wake_up(&info->flush_wait);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
-
- kfree(async);
-}
-
-static void wait_on_flush(struct btrfs_space_info *info)
-{
- DEFINE_WAIT(wait);
- u64 used;
-
- while (1) {
- prepare_to_wait(&info->flush_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&info->lock);
- if (!info->flushing) {
- spin_unlock(&info->lock);
- break;
- }
-
- used = info->bytes_used + info->bytes_reserved +
- info->bytes_pinned + info->bytes_readonly +
- info->bytes_super + info->bytes_root +
- info->bytes_may_use + info->bytes_delalloc;
- if (used < info->total_bytes) {
- spin_unlock(&info->lock);
- break;
- }
- spin_unlock(&info->lock);
- schedule();
- }
- finish_wait(&info->flush_wait, &wait);
-}
-
-static void flush_delalloc(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct async_flush *async;
- bool wait = false;
-
- spin_lock(&info->lock);
+ u64 flags;
- if (!info->flushing)
- info->flushing = 1;
+ if (data)
+ flags = BTRFS_BLOCK_GROUP_DATA;
+ else if (root == root->fs_info->chunk_root)
+ flags = BTRFS_BLOCK_GROUP_SYSTEM;
else
- wait = true;
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_on_flush(info);
- return;
- }
-
- async = kzalloc(sizeof(*async), GFP_NOFS);
- if (!async)
- goto flush;
-
- async->root = root;
- async->info = info;
- async->work.func = flush_delalloc_async;
+ flags = BTRFS_BLOCK_GROUP_METADATA;
- btrfs_queue_worker(&root->fs_info->enospc_workers,
- &async->work);
- wait_on_flush(info);
- return;
-
-flush:
- btrfs_start_delalloc_inodes(root, 0);
- btrfs_wait_ordered_extents(root, 0, 0);
-
- spin_lock(&info->lock);
- info->flushing = 0;
- spin_unlock(&info->lock);
- wake_up(&info->flush_wait);
+ return get_alloc_profile(root, flags);
}
-static int maybe_allocate_chunk(struct btrfs_root *root,
- struct btrfs_space_info *info)
-{
- struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
- struct btrfs_trans_handle *trans;
- bool wait = false;
- int ret = 0;
- u64 min_metadata;
- u64 free_space;
-
- free_space = btrfs_super_total_bytes(disk_super);
- /*
- * we allow the metadata to grow to a max of either 10gb or 5% of the
- * space in the volume.
- */
- min_metadata = min((u64)10 * 1024 * 1024 * 1024,
- div64_u64(free_space * 5, 100));
- if (info->total_bytes >= min_metadata) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (info->full) {
- spin_unlock(&info->lock);
- return 0;
- }
-
- if (!info->allocating_chunk) {
- info->force_alloc = 1;
- info->allocating_chunk = 1;
- } else {
- wait = true;
- }
-
- spin_unlock(&info->lock);
-
- if (wait) {
- wait_event(info->allocate_wait,
- !info->allocating_chunk);
- return 1;
- }
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 4096 + 2 * 1024 * 1024,
- info->flags, 0);
- btrfs_end_transaction(trans, root);
- if (ret)
- goto out;
-out:
- spin_lock(&info->lock);
- info->allocating_chunk = 0;
- spin_unlock(&info->lock);
- wake_up(&info->allocate_wait);
-
- if (ret)
- return 0;
- return 1;
-}
-
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
- struct inode *inode, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int flushed = 0;
- int force_delalloc;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
- num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- force_delalloc = meta_sinfo->force_delalloc;
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!flushed)
- meta_sinfo->bytes_delalloc += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- flushed++;
-
- if (flushed == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- flushed++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (flushed == 2) {
- filemap_flush(inode->i_mapping);
- goto again;
- } else if (flushed == 3) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_delalloc -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
- printk(KERN_ERR "enospc, has %d, reserved %d\n",
- BTRFS_I(inode)->outstanding_extents,
- BTRFS_I(inode)->reserved_extents);
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
-
- BTRFS_I(inode)->reserved_extents += num_items;
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- if (!flushed && force_delalloc)
- filemap_flush(inode->i_mapping);
-
- return 0;
-}
-
-/*
- * unreserve num_items number of items worth of metadata space. This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 alloc_target;
- bool bug = false;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-
- spin_lock(&meta_sinfo->lock);
- if (meta_sinfo->bytes_may_use < num_bytes) {
- bug = true;
- meta_sinfo->bytes_may_use = 0;
- } else {
- meta_sinfo->bytes_may_use -= num_bytes;
- }
- spin_unlock(&meta_sinfo->lock);
-
- BUG_ON(bug);
-
- return 0;
-}
-
-/*
- * Reserve some metadata space for use. We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items. If we
- * have space, fantastic, if not, you get -ENOSPC. Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space. THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
{
- struct btrfs_fs_info *info = root->fs_info;
- struct btrfs_space_info *meta_sinfo;
- u64 num_bytes;
- u64 used;
- u64 alloc_target;
- int retries = 0;
-
- /* get the space info for where the metadata will live */
- alloc_target = btrfs_get_alloc_profile(root, 0);
- meta_sinfo = __find_space_info(info, alloc_target);
-
- num_bytes = calculate_bytes_needed(root, num_items);
-again:
- spin_lock(&meta_sinfo->lock);
-
- if (unlikely(!meta_sinfo->bytes_root))
- meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-
- if (!retries)
- meta_sinfo->bytes_may_use += num_bytes;
-
- used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
- meta_sinfo->bytes_super + meta_sinfo->bytes_root +
- meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-
- if (used > meta_sinfo->total_bytes) {
- retries++;
- if (retries == 1) {
- if (maybe_allocate_chunk(root, meta_sinfo))
- goto again;
- retries++;
- } else {
- spin_unlock(&meta_sinfo->lock);
- }
-
- if (retries == 2) {
- flush_delalloc(root, meta_sinfo);
- goto again;
- }
- spin_lock(&meta_sinfo->lock);
- meta_sinfo->bytes_may_use -= num_bytes;
- spin_unlock(&meta_sinfo->lock);
-
- dump_space_info(meta_sinfo, 0, 0);
- return -ENOSPC;
- }
-
- check_force_delalloc(meta_sinfo);
- spin_unlock(&meta_sinfo->lock);
-
- return 0;
+ BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+ BTRFS_BLOCK_GROUP_DATA);
}
/*
* This will check the space that the inode allocates from to make sure we have
* enough space for bytes.
*/
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
u64 used;
- int ret = 0, committed = 0, flushed = 0;
+ int ret = 0, committed = 0;
/* make sure bytes are sectorsize aligned */
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
- used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
- data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
- data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
- data_sinfo->bytes_super;
+ used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
+ data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
+ data_sinfo->bytes_may_use;
if (used + bytes > data_sinfo->total_bytes) {
struct btrfs_trans_handle *trans;
- if (!flushed) {
- spin_unlock(&data_sinfo->lock);
- flush_delalloc(root, data_sinfo);
- flushed = 1;
- goto again;
- }
-
/*
* if we don't have enough free bytes in this space then we need
* to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ trans = btrfs_join_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
bytes + 2 * 1024 * 1024,
alloc_target, 0);
btrfs_end_transaction(trans, root);
- if (ret)
+ if (ret < 0)
return ret;
if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
ret = btrfs_commit_transaction(trans, root);
if (ret)
return ret;
goto again;
}
- printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
- ", %llu bytes_used, %llu bytes_reserved, "
- "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
- "%llu total\n", (unsigned long long)bytes,
- (unsigned long long)data_sinfo->bytes_delalloc,
+#if 0 /* I hope we never need this code again, just in case */
+ printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
+ "%llu bytes_reserved, " "%llu bytes_pinned, "
+ "%llu bytes_readonly, %llu may use %llu total\n",
+ (unsigned long long)bytes,
(unsigned long long)data_sinfo->bytes_used,
(unsigned long long)data_sinfo->bytes_reserved,
(unsigned long long)data_sinfo->bytes_pinned,
(unsigned long long)data_sinfo->bytes_readonly,
(unsigned long long)data_sinfo->bytes_may_use,
(unsigned long long)data_sinfo->total_bytes);
+#endif
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
}
/*
- * if there was an error for whatever reason after calling
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * called when we are clearing an delalloc extent from the
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
*/
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
- struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
/* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
spin_unlock(&data_sinfo->lock);
}
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *data_sinfo;
-
- /* get the space info for where this inode will be storing its data */
- data_sinfo = BTRFS_I(inode)->space_info;
-
- /* make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- data_sinfo->bytes_delalloc += bytes;
-
- /*
- * we are adding a delalloc extent without calling
- * btrfs_check_data_free_space first. This happens on a weird
- * writepage condition, but shouldn't hurt our accounting
- */
- if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
- data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
- BTRFS_I(inode)->reserved_bytes = 0;
- } else {
- data_sinfo->bytes_may_use -= bytes;
- BTRFS_I(inode)->reserved_bytes -= bytes;
- }
-
- spin_unlock(&data_sinfo->lock);
-}
-
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
- u64 bytes)
-{
- struct btrfs_space_info *info;
-
- info = BTRFS_I(inode)->space_info;
-
- spin_lock(&info->lock);
- info->bytes_delalloc -= bytes;
- spin_unlock(&info->lock);
-}
-
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
rcu_read_unlock();
}
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+ u64 alloc_bytes)
+{
+ u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+ return 0;
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved +
+ alloc_bytes < div_factor(num_bytes, 8))
+ return 0;
+
+ return 1;
+}
+
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force)
{
struct btrfs_space_info *space_info;
struct btrfs_fs_info *fs_info = extent_root->fs_info;
- u64 thresh;
int ret = 0;
mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
goto out;
}
- thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 8);
- if (!force &&
- (space_info->bytes_used + space_info->bytes_pinned +
- space_info->bytes_reserved + alloc_bytes) < thresh) {
+ if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ else
+ ret = 1;
space_info->force_alloc = 0;
spin_unlock(&space_info->lock);
out:
@@ -3461,13 +3073,713 @@ out:
return ret;
}
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+ int ret;
+ int end_trans = 0;
+
+ if (sinfo->full)
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
+ spin_unlock(&sinfo->lock);
+ if (!ret)
+ return 0;
+
+ if (!trans) {
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ end_trans = 1;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ num_bytes + 2 * 1024 * 1024,
+ get_alloc_profile(root, sinfo->flags), 0);
+
+ if (end_trans)
+ btrfs_end_transaction(trans, root);
+
+ return ret == 1 ? 1 : 0;
+}
+
+/*
+ * shrink metadata reservation for delalloc
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 to_reclaim)
+{
+ struct btrfs_block_rsv *block_rsv;
+ u64 reserved;
+ u64 max_reclaim;
+ u64 reclaimed = 0;
+ int pause = 1;
+ int ret;
+
+ block_rsv = &root->fs_info->delalloc_block_rsv;
+ spin_lock(&block_rsv->lock);
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0)
+ return 0;
+
+ max_reclaim = min(reserved, to_reclaim);
+
+ while (1) {
+ ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+ if (!ret) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(pause);
+ pause <<= 1;
+ if (pause > HZ / 10)
+ pause = HZ / 10;
+ } else {
+ pause = 1;
+ }
+
+ spin_lock(&block_rsv->lock);
+ if (reserved > block_rsv->reserved)
+ reclaimed = reserved - block_rsv->reserved;
+ reserved = block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+
+ if (reserved == 0 || reclaimed >= max_reclaim)
+ break;
+
+ if (trans && trans->transaction->blocked)
+ return -EAGAIN;
+ }
+ return reclaimed >= to_reclaim;
+}
+
+static int should_retry_reserve(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ int ret;
+
+ if ((*retries) > 2)
+ return -ENOSPC;
+
+ ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
+ if (ret)
+ return 1;
+
+ if (trans && trans->transaction->in_commit)
+ return -ENOSPC;
+
+ ret = shrink_delalloc(trans, root, num_bytes);
+ if (ret)
+ return ret;
+
+ spin_lock(&space_info->lock);
+ if (space_info->bytes_pinned < num_bytes)
+ ret = 1;
+ spin_unlock(&space_info->lock);
+ if (ret)
+ return -ENOSPC;
+
+ (*retries)++;
+
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+
+ return 1;
+}
+
+static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+ u64 unused;
+ int ret = -ENOSPC;
+
+ spin_lock(&space_info->lock);
+ unused = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly;
+
+ if (unused < space_info->total_bytes)
+ unused = space_info->total_bytes - unused;
+ else
+ unused = 0;
+
+ if (unused >= num_bytes) {
+ if (block_rsv->priority >= 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ } else {
+ if ((unused + block_rsv->reserved) *
+ block_rsv->priority >=
+ (num_bytes + block_rsv->reserved) * 10) {
+ space_info->bytes_reserved += num_bytes;
+ ret = 0;
+ }
+ }
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
+static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ if (root->ref_cows)
+ block_rsv = trans->block_rsv;
+ else
+ block_rsv = root->block_rsv;
+
+ if (!block_rsv)
+ block_rsv = &root->fs_info->empty_block_rsv;
+
+ return block_rsv;
+}
+
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ int ret = -ENOSPC;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved >= num_bytes) {
+ block_rsv->reserved -= num_bytes;
+ if (block_rsv->reserved < block_rsv->size)
+ block_rsv->full = 0;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+ return ret;
+}
+
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int update_size)
+{
+ spin_lock(&block_rsv->lock);
+ block_rsv->reserved += num_bytes;
+ if (update_size)
+ block_rsv->size += num_bytes;
+ else if (block_rsv->reserved >= block_rsv->size)
+ block_rsv->full = 1;
+ spin_unlock(&block_rsv->lock);
+}
+
+void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+ struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+ struct btrfs_space_info *space_info = block_rsv->space_info;
+
+ spin_lock(&block_rsv->lock);
+ if (num_bytes == (u64)-1)
+ num_bytes = block_rsv->size;
+ block_rsv->size -= num_bytes;
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ } else {
+ num_bytes = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (num_bytes > 0) {
+ if (dest) {
+ block_rsv_add_bytes(dest, num_bytes, 0);
+ } else {
+ spin_lock(&space_info->lock);
+ space_info->bytes_reserved -= num_bytes;
+ spin_unlock(&space_info->lock);
+ }
+ }
+}
+
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+ int ret;
+
+ ret = block_rsv_use_bytes(src, num_bytes);
+ if (ret)
+ return ret;
+
+ block_rsv_add_bytes(dst, num_bytes, 1);
+ return 0;
+}
+
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+{
+ memset(rsv, 0, sizeof(*rsv));
+ spin_lock_init(&rsv->lock);
+ atomic_set(&rsv->usage, 1);
+ rsv->priority = 6;
+ INIT_LIST_HEAD(&rsv->list);
+}
+
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 alloc_target;
+
+ block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+ if (!block_rsv)
+ return NULL;
+
+ btrfs_init_block_rsv(block_rsv);
+
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ block_rsv->space_info = __find_space_info(fs_info,
+ BTRFS_BLOCK_GROUP_METADATA);
+
+ return block_rsv;
+}
+
+void btrfs_free_block_rsv(struct btrfs_root *root,
+ struct btrfs_block_rsv *rsv)
+{
+ if (rsv && atomic_dec_and_test(&rsv->usage)) {
+ btrfs_block_rsv_release(root, rsv, (u64)-1);
+ if (!rsv->durable)
+ kfree(rsv);
+ }
+}
+
+/*
+ * make the block_rsv struct be able to capture freed space.
+ * the captured space will re-add to the the block_rsv struct
+ * after transaction commit
+ */
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_rsv *block_rsv)
+{
+ block_rsv->durable = 1;
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
+}
+
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes, int *retries)
+{
+ int ret;
+
+ if (num_bytes == 0)
+ return 0;
+again:
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 1);
+ return 0;
+ }
+
+ ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
+ if (ret > 0)
+ goto again;
+
+ return ret;
+}
+
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 min_reserved, int min_factor)
+{
+ u64 num_bytes = 0;
+ int commit_trans = 0;
+ int ret = -ENOSPC;
+
+ if (!block_rsv)
+ return 0;
+
+ spin_lock(&block_rsv->lock);
+ if (min_factor > 0)
+ num_bytes = div_factor(block_rsv->size, min_factor);
+ if (min_reserved > num_bytes)
+ num_bytes = min_reserved;
+
+ if (block_rsv->reserved >= num_bytes) {
+ ret = 0;
+ } else {
+ num_bytes -= block_rsv->reserved;
+ if (block_rsv->durable &&
+ block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+ commit_trans = 1;
+ }
+ spin_unlock(&block_rsv->lock);
+ if (!ret)
+ return 0;
+
+ if (block_rsv->refill_used) {
+ ret = reserve_metadata_bytes(block_rsv, num_bytes);
+ if (!ret) {
+ block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ return 0;
+ }
+ }
+
+ if (commit_trans) {
+ if (trans)
+ return -EAGAIN;
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ ret = btrfs_commit_transaction(trans, root);
+ return 0;
+ }
+
+ WARN_ON(1);
+ printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return -ENOSPC;
+}
+
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+ struct btrfs_block_rsv *dst_rsv,
+ u64 num_bytes)
+{
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_block_rsv_release(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 num_bytes)
+{
+ struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+ if (global_rsv->full || global_rsv == block_rsv ||
+ block_rsv->space_info != global_rsv->space_info)
+ global_rsv = NULL;
+ block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+}
+
+/*
+ * helper to calculate size of global block reservation.
+ * the desired value is sum of space used by extent tree,
+ * checksum tree and root tree
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *sinfo;
+ u64 num_bytes;
+ u64 meta_used;
+ u64 data_used;
+ int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+#if 0
+ /*
+ * per tree used space accounting can be inaccuracy, so we
+ * can't rely on it.
+ */
+ spin_lock(&fs_info->extent_root->accounting_lock);
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+ spin_unlock(&fs_info->extent_root->accounting_lock);
+
+ spin_lock(&fs_info->csum_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+ spin_unlock(&fs_info->csum_root->accounting_lock);
+
+ spin_lock(&fs_info->tree_root->accounting_lock);
+ num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+ spin_unlock(&fs_info->tree_root->accounting_lock);
+#endif
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+ spin_lock(&sinfo->lock);
+ data_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&sinfo->lock);
+ meta_used = sinfo->bytes_used;
+ spin_unlock(&sinfo->lock);
+
+ num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+ csum_size * 2;
+ num_bytes += div64_u64(data_used + meta_used, 50);
+
+ if (num_bytes * 3 > meta_used)
+ num_bytes = div64_u64(meta_used, 3);
+
+ return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+}
+
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ struct btrfs_space_info *sinfo = block_rsv->space_info;
+ u64 num_bytes;
+
+ num_bytes = calc_global_metadata_size(fs_info);
+
+ spin_lock(&block_rsv->lock);
+ spin_lock(&sinfo->lock);
+
+ block_rsv->size = num_bytes;
+
+ num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
+ sinfo->bytes_reserved + sinfo->bytes_readonly;
+
+ if (sinfo->total_bytes > num_bytes) {
+ num_bytes = sinfo->total_bytes - num_bytes;
+ block_rsv->reserved += num_bytes;
+ sinfo->bytes_reserved += num_bytes;
+ }
+
+ if (block_rsv->reserved >= block_rsv->size) {
+ num_bytes = block_rsv->reserved - block_rsv->size;
+ sinfo->bytes_reserved -= num_bytes;
+ block_rsv->reserved = block_rsv->size;
+ block_rsv->full = 1;
+ }
+#if 0
+ printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+ block_rsv->size, block_rsv->reserved);
+#endif
+ spin_unlock(&sinfo->lock);
+ spin_unlock(&block_rsv->lock);
+}
+
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *space_info;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+ fs_info->chunk_block_rsv.space_info = space_info;
+ fs_info->chunk_block_rsv.priority = 10;
+
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ fs_info->global_block_rsv.space_info = space_info;
+ fs_info->global_block_rsv.priority = 10;
+ fs_info->global_block_rsv.refill_used = 1;
+ fs_info->delalloc_block_rsv.space_info = space_info;
+ fs_info->trans_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.space_info = space_info;
+ fs_info->empty_block_rsv.priority = 10;
+
+ fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+ fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
+
+ btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+
+ update_global_block_rsv(fs_info);
+}
+
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+ block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+ WARN_ON(fs_info->delalloc_block_rsv.size > 0);
+ WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
+ WARN_ON(fs_info->trans_block_rsv.size > 0);
+ WARN_ON(fs_info->trans_block_rsv.reserved > 0);
+ WARN_ON(fs_info->chunk_block_rsv.size > 0);
+ WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+}
+
+static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+{
+ return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+ 3 * num_items;
+}
+
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ int num_items, int *retries)
+{
+ u64 num_bytes;
+ int ret;
+
+ if (num_items == 0 || root->fs_info->chunk_root == root)
+ return 0;
+
+ num_bytes = calc_trans_metadata_size(root, num_items);
+ ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
+ num_bytes, retries);
+ if (!ret) {
+ trans->bytes_reserved += num_bytes;
+ trans->block_rsv = &root->fs_info->trans_block_rsv;
+ }
+ return ret;
+}
+
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (!trans->bytes_reserved)
+ return;
+
+ BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+ btrfs_block_rsv_release(root, trans->block_rsv,
+ trans->bytes_reserved);
+ trans->bytes_reserved = 0;
+}
+
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
+
+ /*
+ * one for deleting orphan item, one for updating inode and
+ * two for calling btrfs_truncate_inode_items.
+ *
+ * btrfs_truncate_inode_items is a delete operation, it frees
+ * more space than it uses in most cases. So two units of
+ * metadata space should be enough for calling it many times.
+ * If all of the metadata space is used, we can commit
+ * transaction and use space it freed.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 num_bytes = calc_trans_metadata_size(root, 4);
+ btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+}
+
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
+ /*
+ * two for root back/forward refs, two for directory entries
+ * and one for root of the snapshot.
+ */
+ u64 num_bytes = calc_trans_metadata_size(root, 5);
+ dst_rsv->space_info = src_rsv->space_info;
+ return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+{
+ return num_bytes >>= 3;
+}
+
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+ u64 to_reserve;
+ int nr_extents;
+ int retries = 0;
+ int ret;
+
+ if (btrfs_transaction_in_commit(root->fs_info))
+ schedule_timeout(1);
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+again:
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+ if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+ nr_extents -= BTRFS_I(inode)->reserved_extents;
+ to_reserve = calc_trans_metadata_size(root, nr_extents);
+ } else {
+ nr_extents = 0;
+ to_reserve = 0;
+ }
+
+ to_reserve += calc_csum_metadata_size(inode, num_bytes);
+ ret = reserve_metadata_bytes(block_rsv, to_reserve);
+ if (ret) {
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+ ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+ &retries);
+ if (ret > 0)
+ goto again;
+ return ret;
+ }
+
+ BTRFS_I(inode)->reserved_extents += nr_extents;
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ block_rsv_add_bytes(block_rsv, to_reserve, 1);
+
+ if (block_rsv->size > 512 * 1024 * 1024)
+ shrink_delalloc(NULL, root, to_reserve);
+
+ return 0;
+}
+
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 to_free;
+ int nr_extents;
+
+ num_bytes = ALIGN(num_bytes, root->sectorsize);
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ spin_lock(&BTRFS_I(inode)->accounting_lock);
+ nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+ if (nr_extents < BTRFS_I(inode)->reserved_extents) {
+ nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
+ BTRFS_I(inode)->reserved_extents -= nr_extents;
+ } else {
+ nr_extents = 0;
+ }
+ spin_unlock(&BTRFS_I(inode)->accounting_lock);
+
+ to_free = calc_csum_metadata_size(inode, num_bytes);
+ if (nr_extents > 0)
+ to_free += calc_trans_metadata_size(root, nr_extents);
+
+ btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+ to_free);
+}
+
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+ int ret;
+
+ ret = btrfs_check_data_free_space(inode, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+ if (ret) {
+ btrfs_free_reserved_data_space(inode, num_bytes);
+ return ret;
+ }
+
+ return 0;
+}
+
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+ btrfs_delalloc_release_metadata(inode, num_bytes);
+ btrfs_free_reserved_data_space(inode, num_bytes);
+}
+
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc,
- int mark_free)
+ u64 bytenr, u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *info = root->fs_info;
+ int factor;
u64 total = num_bytes;
u64 old_val;
u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache)
return -1;
+ if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID10))
+ factor = 2;
+ else
+ factor = 1;
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
old_val += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
- cache->space_info->bytes_used += num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly -= num_bytes;
+ cache->space_info->bytes_used += num_bytes;
+ cache->space_info->disk_used += num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
old_val -= num_bytes;
- cache->space_info->bytes_used -= num_bytes;
- if (cache->ro)
- cache->space_info->bytes_readonly += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ cache->space_info->bytes_used -= num_bytes;
+ cache->space_info->disk_used -= num_bytes * factor;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- if (mark_free) {
- int ret;
- ret = btrfs_discard_extent(root, bytenr,
- num_bytes);
- WARN_ON(ret);
-
- ret = btrfs_add_free_space(cache, bytenr,
- num_bytes);
- WARN_ON(ret);
- }
+ set_extent_dirty(info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1,
+ GFP_NOFS | __GFP_NOFAIL);
}
btrfs_put_block_group(cache);
total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-/*
- * this function must be called within transaction
- */
-int btrfs_pin_extent(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int reserved)
+static int pin_down_extent(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_group_cache *cache;
-
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
-
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
+ set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ return 0;
+}
+
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+ BUG_ON(!cache);
+
+ pin_down_extent(root, cache, bytenr, num_bytes, reserved);
- set_extent_dirty(fs_info->pinned_extents,
- bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ btrfs_put_block_group(cache);
return 0;
}
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+/*
+ * update size of reserved extents. this function may return -EAGAIN
+ * if 'reserve' is true or 'sinfo' is false.
+ */
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve, int sinfo)
{
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += num_bytes;
- cache->space_info->bytes_reserved += num_bytes;
+ int ret = 0;
+ if (sinfo) {
+ struct btrfs_space_info *space_info = cache->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ }
+ } else {
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
} else {
- cache->reserved -= num_bytes;
- cache->space_info->bytes_reserved -= num_bytes;
+ spin_lock(&cache->lock);
+ if (cache->ro) {
+ ret = -EAGAIN;
+ } else {
+ if (reserve)
+ cache->reserved += num_bytes;
+ else
+ cache->reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- return 0;
+ return ret;
}
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
fs_info->pinned_extents = &fs_info->freed_extents[0];
up_write(&fs_info->extent_commit_sem);
+
+ update_global_block_rsv(fs_info);
return 0;
}
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
btrfs_add_free_space(cache, start, len);
}
+ start += len;
+
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
+ if (cache->ro) {
+ cache->space_info->bytes_readonly += len;
+ } else if (cache->reserved_pinned > 0) {
+ len = min(len, cache->reserved_pinned);
+ cache->reserved_pinned -= len;
+ cache->space_info->bytes_reserved += len;
+ }
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
-
- start += len;
}
if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_rsv *next_rsv;
u64 start;
u64 end;
+ int idx;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
cond_resched();
}
- return ret;
-}
+ mutex_lock(&fs_info->durable_block_rsv_mutex);
+ list_for_each_entry_safe(block_rsv, next_rsv,
+ &fs_info->durable_block_rsv_list, list) {
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- u64 bytenr, u64 num_bytes,
- int is_data, int reserved,
- struct extent_buffer **must_clean)
-{
- int err = 0;
- struct extent_buffer *buf;
-
- if (is_data)
- goto pinit;
-
- /*
- * discard is sloooow, and so triggering discards on
- * individual btree blocks isn't a good plan. Just
- * pin everything in discard mode.
- */
- if (btrfs_test_opt(root, DISCARD))
- goto pinit;
-
- buf = btrfs_find_tree_block(root, bytenr, num_bytes);
- if (!buf)
- goto pinit;
+ idx = trans->transid & 0x1;
+ if (block_rsv->freed[idx] > 0) {
+ block_rsv_add_bytes(block_rsv,
+ block_rsv->freed[idx], 0);
+ block_rsv->freed[idx] = 0;
+ }
+ if (atomic_read(&block_rsv->usage) == 0) {
+ btrfs_block_rsv_release(root, block_rsv, (u64)-1);
- /* we can reuse a block if it hasn't been written
- * and it is from this transaction. We can't
- * reuse anything from the tree log root because
- * it has tiny sub-transactions.
- */
- if (btrfs_buffer_uptodate(buf, 0) &&
- btrfs_try_tree_lock(buf)) {
- u64 header_owner = btrfs_header_owner(buf);
- u64 header_transid = btrfs_header_generation(buf);
- if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
- header_transid == trans->transid &&
- !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- *must_clean = buf;
- return 1;
+ if (block_rsv->freed[0] == 0 &&
+ block_rsv->freed[1] == 0) {
+ list_del_init(&block_rsv->list);
+ kfree(block_rsv);
+ }
+ } else {
+ btrfs_block_rsv_release(root, block_rsv, 0);
}
- btrfs_tree_unlock(buf);
}
- free_extent_buffer(buf);
-pinit:
- if (path)
- btrfs_set_path_blocking(path);
- /* unlocks the pinned mutex */
- btrfs_pin_extent(root, bytenr, num_bytes, reserved);
+ mutex_unlock(&fs_info->durable_block_rsv_mutex);
- BUG_ON(err < 0);
return 0;
}
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
} else {
- int mark_free = 0;
- struct extent_buffer *must_clean = NULL;
-
if (found_extent) {
BUG_ON(is_data && refs_to_drop !=
extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, 0, &must_clean);
- if (ret > 0)
- mark_free = 1;
- BUG_ON(ret < 0);
- /*
- * it is going to be very rare for someone to be waiting
- * on the block we're freeing. del_items might need to
- * schedule, so rather than get fancy, just force it
- * to blocking here
- */
- if (must_clean)
- btrfs_set_lock_blocking(must_clean);
-
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
- if (must_clean) {
- clean_tree_block(NULL, root, must_clean);
- btrfs_tree_unlock(must_clean);
- free_extent_buffer(must_clean);
- }
-
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
(bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
- ret = update_block_group(trans, root, bytenr, num_bytes, 0,
- mark_free);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
BUG_ON(ret);
}
btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
/*
- * when we free an extent, it is possible (and likely) that we free the last
+ * when we free an block, it is possible (and likely) that we free the last
* delayed ref for that extent as well. This searches the delayed ref tree for
* a given extent, and if there are no other delayed refs to be processed, it
* removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct rb_node *node;
- int ret;
+ int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
list_del_init(&head->cluster);
spin_unlock(&delayed_refs->lock);
- ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
- &head->node, head->extent_op,
- head->must_insert_reserved);
- BUG_ON(ret);
+ BUG_ON(head->extent_op);
+ if (head->must_insert_reserved)
+ ret = 1;
+
+ mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
- return 0;
+ return ret;
out:
spin_unlock(&delayed_refs->lock);
return 0;
}
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf,
+ u64 parent, int last_ref)
+{
+ struct btrfs_block_rsv *block_rsv;
+ struct btrfs_block_group_cache *cache = NULL;
+ int ret;
+
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+ parent, root->root_key.objectid,
+ btrfs_header_level(buf),
+ BTRFS_DROP_DELAYED_REF, NULL);
+ BUG_ON(ret);
+ }
+
+ if (!last_ref)
+ return;
+
+ block_rsv = get_block_rsv(trans, root);
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+ BUG_ON(block_rsv->space_info != cache->space_info);
+
+ if (btrfs_header_generation(buf) == trans->transid) {
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, root, buf->start);
+ if (!ret)
+ goto pin;
+ }
+
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(root, cache, buf->start, buf->len, 1);
+ goto pin;
+ }
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(cache, buf->start, buf->len);
+ ret = update_reserved_bytes(cache, buf->len, 0, 0);
+ if (ret == -EAGAIN) {
+ /* block group became read-only */
+ update_reserved_bytes(cache, buf->len, 0, 1);
+ goto out;
+ }
+
+ ret = 1;
+ spin_lock(&block_rsv->lock);
+ if (block_rsv->reserved < block_rsv->size) {
+ block_rsv->reserved += buf->len;
+ ret = 0;
+ }
+ spin_unlock(&block_rsv->lock);
+
+ if (ret) {
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_reserved -= buf->len;
+ spin_unlock(&cache->space_info->lock);
+ }
+ goto out;
+ }
+pin:
+ if (block_rsv->durable && !cache->ro) {
+ ret = 0;
+ spin_lock(&cache->lock);
+ if (!cache->ro) {
+ cache->reserved_pinned += buf->len;
+ ret = 1;
+ }
+ spin_unlock(&cache->lock);
+
+ if (ret) {
+ spin_lock(&block_rsv->lock);
+ block_rsv->freed[trans->transid & 0x1] += buf->len;
+ spin_unlock(&block_rsv->lock);
+ }
+ }
+out:
+ btrfs_put_block_group(cache);
+}
+
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, (int)owner,
BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret);
- ret = check_ref_cleanup(trans, root, bytenr);
- BUG_ON(ret);
} else {
ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
parent, root_objectid, owner,
@@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u32 blocksize,
- u64 parent, u64 root_objectid, int level)
-{
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) - blocksize;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
-
- return btrfs_free_extent(trans, root, bytenr, blocksize,
- parent, root_objectid, level, 0);
-}
-
static u64 stripe_align(struct btrfs_root *root, u64 val)
{
u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
return 0;
}
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+ int index;
+ if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+ index = 0;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+ index = 1;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+ index = 2;
+ else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+ index = 3;
+ else
+ index = 4;
+ return index;
+}
+
enum btrfs_loop_type {
LOOP_FIND_IDEAL = 0,
LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 empty_size,
u64 search_start, u64 search_end,
u64 hint_byte, struct btrfs_key *ins,
- u64 exclude_start, u64 exclude_nr,
int data)
{
int ret = 0;
@@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
+ int index = 0;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
@@ -4237,6 +4620,7 @@ ideal_cache:
btrfs_put_block_group(block_group);
up_read(&space_info->groups_sem);
} else {
+ index = get_block_group_index(block_group);
goto have_block_group;
}
} else if (block_group) {
@@ -4245,7 +4629,8 @@ ideal_cache:
}
search:
down_read(&space_info->groups_sem);
- list_for_each_entry(block_group, &space_info->block_groups, list) {
+ list_for_each_entry(block_group, &space_info->block_groups[index],
+ list) {
u64 offset;
int cached;
@@ -4436,23 +4821,22 @@ checks:
goto loop;
}
- if (exclude_nr > 0 &&
- (search_start + num_bytes > exclude_start &&
- search_start < exclude_start + exclude_nr)) {
- search_start = exclude_start + exclude_nr;
+ ins->objectid = search_start;
+ ins->offset = num_bytes;
+
+ if (offset < search_start)
+ btrfs_add_free_space(block_group, offset,
+ search_start - offset);
+ BUG_ON(offset > search_start);
+ ret = update_reserved_bytes(block_group, num_bytes, 1,
+ (data & BTRFS_BLOCK_GROUP_DATA));
+ if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
- /*
- * if search_start is still in this block group
- * then we just re-search this block group
- */
- if (search_start >= block_group->key.objectid &&
- search_start < (block_group->key.objectid +
- block_group->key.offset))
- goto have_block_group;
goto loop;
}
+ /* we are all good, lets return */
ins->objectid = search_start;
ins->offset = num_bytes;
@@ -4460,18 +4844,18 @@ checks:
btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
-
- update_reserved_extents(block_group, num_bytes, 1);
-
- /* we are all good, lets return */
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
+ BUG_ON(index != get_block_group_index(block_group));
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
+ if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+ goto search;
+
/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
* for them to make caching progress. Also
* determine the best possible bg to cache
@@ -4485,6 +4869,7 @@ loop:
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
(found_uncached_bg || empty_size || empty_cluster ||
allowed_chunk_alloc)) {
+ index = 0;
if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
found_uncached_bg = false;
loop++;
@@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ int index = 0;
spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved -
- info->bytes_super),
+ info->bytes_readonly),
(info->full) ? "" : "not ");
- printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
- "\n",
+ printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ "reserved=%llu, may_use=%llu, readonly=%llu\n",
(unsigned long long)info->total_bytes,
+ (unsigned long long)info->bytes_used,
(unsigned long long)info->bytes_pinned,
- (unsigned long long)info->bytes_delalloc,
+ (unsigned long long)info->bytes_reserved,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used,
- (unsigned long long)info->bytes_root,
- (unsigned long long)info->bytes_super,
- (unsigned long long)info->bytes_reserved);
+ (unsigned long long)info->bytes_readonly);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
- list_for_each_entry(cache, &info->block_groups, list) {
+again:
+ list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
"%llu pinned %llu reserved\n",
@@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
}
+ if (++index < BTRFS_NR_RAID_TYPES)
+ goto again;
up_read(&info->groups_sem);
}
@@ -4628,9 +5014,8 @@ again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(trans, root, num_bytes, empty_size,
- search_start, search_end, hint_byte, ins,
- trans->alloc_exclude_start,
- trans->alloc_exclude_nr, data);
+ search_start, search_end, hint_byte,
+ ins, data);
if (ret == -ENOSPC && num_bytes > min_alloc_size) {
num_bytes = num_bytes >> 1;
@@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
- update_reserved_extents(cache, len, 0);
+ update_reserved_bytes(cache, len, 0, 1);
btrfs_put_block_group(cache);
return ret;
@@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = update_block_group(trans, root, ins->objectid, ins->offset,
- 1, 0);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) {
printk(KERN_ERR "btrfs update block group failed for %llu "
"%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
put_caching_control(caching_ctl);
}
- update_reserved_extents(block_group, ins->offset, 1);
+ ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+ BUG_ON(ret);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
return ret;
}
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-static int alloc_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 parent, u64 root_objectid,
- struct btrfs_disk_key *key, int level,
- u64 empty_size, u64 hint_byte, u64 search_end,
- struct btrfs_key *ins)
-{
- int ret;
- u64 flags = 0;
-
- ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
- if (ret)
- return ret;
-
- if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (parent == 0)
- parent = ins->objectid;
- flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
- } else
- BUG_ON(parent > 0);
-
- if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- struct btrfs_delayed_extent_op *extent_op;
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
- if (key)
- memcpy(&extent_op->key, key, sizeof(extent_op->key));
- else
- memset(&extent_op->key, 0, sizeof(extent_op->key));
- extent_op->flags_to_set = flags;
- extent_op->update_key = 1;
- extent_op->update_flags = 1;
- extent_op->is_data = 0;
-
- ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
- ins->offset, parent, root_objectid,
- level, BTRFS_ADD_DELAYED_EXTENT,
- extent_op);
- BUG_ON(ret);
- }
-
- if (root_objectid == root->root_key.objectid) {
- u64 used;
- spin_lock(&root->node_lock);
- used = btrfs_root_used(&root->root_item) + num_bytes;
- btrfs_set_root_used(&root->root_item, used);
- spin_unlock(&root->node_lock);
- }
- return ret;
-}
-
struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u32 blocksize,
@@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
return buf;
}
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u32 blocksize)
+{
+ struct btrfs_block_rsv *block_rsv;
+ int ret;
+
+ block_rsv = get_block_rsv(trans, root);
+
+ if (block_rsv->size == 0) {
+ ret = reserve_metadata_bytes(block_rsv, blocksize);
+ if (ret)
+ return ERR_PTR(ret);
+ return block_rsv;
+ }
+
+ ret = block_rsv_use_bytes(block_rsv, blocksize);
+ if (!ret)
+ return block_rsv;
+
+ WARN_ON(1);
+ printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+ block_rsv->size, block_rsv->reserved,
+ block_rsv->freed[0], block_rsv->freed[1]);
+
+ return ERR_PTR(-ENOSPC);
+}
+
+static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+ block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_release_bytes(block_rsv, NULL, 0);
+}
+
/*
- * helper function to allocate a block for a given tree
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
* returns the tree buffer or NULL.
*/
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
u64 hint, u64 empty_size)
{
struct btrfs_key ins;
- int ret;
+ struct btrfs_block_rsv *block_rsv;
struct extent_buffer *buf;
+ u64 flags = 0;
+ int ret;
+
- ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
- key, level, empty_size, hint, (u64)-1, &ins);
+ block_rsv = use_block_rsv(trans, root, blocksize);
+ if (IS_ERR(block_rsv))
+ return ERR_CAST(block_rsv);
+
+ ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+ empty_size, hint, (u64)-1, &ins, 0);
if (ret) {
- BUG_ON(ret > 0);
+ unuse_block_rsv(block_rsv, blocksize);
return ERR_PTR(ret);
}
buf = btrfs_init_new_buffer(trans, root, ins.objectid,
blocksize, level);
+ BUG_ON(IS_ERR(buf));
+
+ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (parent == 0)
+ parent = ins.objectid;
+ flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ } else
+ BUG_ON(parent > 0);
+
+ if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+ struct btrfs_delayed_extent_op *extent_op;
+ extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+ BUG_ON(!extent_op);
+ if (key)
+ memcpy(&extent_op->key, key, sizeof(extent_op->key));
+ else
+ memset(&extent_op->key, 0, sizeof(extent_op->key));
+ extent_op->flags_to_set = flags;
+ extent_op->update_key = 1;
+ extent_op->update_flags = 1;
+ extent_op->is_data = 0;
+
+ ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+ ins.offset, parent, root_objectid,
+ level, BTRFS_ADD_DELAYED_EXTENT,
+ extent_op);
+ BUG_ON(ret);
+ }
return buf;
}
@@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
- int ret = 0;
+ int ret;
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
u64 parent = 0;
@@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
btrfs_header_owner(path->nodes[level + 1]));
}
- ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
- root->root_key.objectid, level, 0);
- BUG_ON(ret);
+ btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
wc->refs[level] = 0;
wc->flags[level] = 0;
- return ret;
+ return 0;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
* also make sure backrefs for the shared block and all lower level
* blocks are properly updated.
*/
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv, int update_ref)
{
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
@@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc = kzalloc(sizeof(*wc), GFP_NOFS);
BUG_ON(!wc);
- trans = btrfs_start_transaction(tree_root, 1);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
@@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
}
BUG_ON(wc->level == 0);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing) {
+ if (btrfs_should_end_transaction(trans, tree_root)) {
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
root_item);
BUG_ON(ret);
- btrfs_end_transaction(trans, tree_root);
- trans = btrfs_start_transaction(tree_root, 1);
- } else {
- unsigned long update;
- update = trans->delayed_ref_updates;
- trans->delayed_ref_updates = 0;
- if (update)
- btrfs_run_delayed_refs(trans, tree_root,
- update);
+ btrfs_end_transaction_throttle(trans, tree_root);
+ trans = btrfs_start_transaction(tree_root, 0);
+ if (block_rsv)
+ trans->block_rsv = block_rsv;
}
}
btrfs_release_path(root, path);
@@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
kfree(root);
}
out:
- btrfs_end_transaction(trans, tree_root);
+ btrfs_end_transaction_throttle(trans, tree_root);
kfree(wc);
btrfs_free_path(path);
return err;
@@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
return flags;
}
-static int __alloc_chunk_for_shrink(struct btrfs_root *root,
- struct btrfs_block_group_cache *shrink_block_group,
- int force)
+static int set_block_group_ro(struct btrfs_block_group_cache *cache)
{
- struct btrfs_trans_handle *trans;
- u64 new_alloc_flags;
- u64 calc;
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+ int ret = -ENOSPC;
- spin_lock(&shrink_block_group->lock);
- if (btrfs_block_group_used(&shrink_block_group->item) +
- shrink_block_group->reserved > 0) {
- spin_unlock(&shrink_block_group->lock);
+ if (cache->ro)
+ return 0;
- trans = btrfs_start_transaction(root, 1);
- spin_lock(&shrink_block_group->lock);
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+
+ if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+ sinfo->bytes_may_use + sinfo->bytes_readonly +
+ cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+ sinfo->bytes_readonly += num_bytes;
+ sinfo->bytes_reserved += cache->reserved_pinned;
+ cache->reserved_pinned = 0;
+ cache->ro = 1;
+ ret = 0;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
+ return ret;
+}
- new_alloc_flags = update_block_group_flags(root,
- shrink_block_group->flags);
- if (new_alloc_flags != shrink_block_group->flags) {
- calc =
- btrfs_block_group_used(&shrink_block_group->item);
- } else {
- calc = shrink_block_group->key.offset;
- }
- spin_unlock(&shrink_block_group->lock);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
- do_chunk_alloc(trans, root->fs_info->extent_root,
- calc + 2 * 1024 * 1024, new_alloc_flags, force);
+{
+ struct btrfs_trans_handle *trans;
+ u64 alloc_flags;
+ int ret;
- btrfs_end_transaction(trans, root);
- } else
- spin_unlock(&shrink_block_group->lock);
- return 0;
-}
+ BUG_ON(cache->ro);
+
+ trans = btrfs_join_transaction(root, 1);
+ BUG_ON(IS_ERR(trans));
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ if (alloc_flags != cache->flags)
+ do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
- struct btrfs_block_group_cache *group)
+ ret = set_block_group_ro(cache);
+ if (!ret)
+ goto out;
+ alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+ ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+ if (ret < 0)
+ goto out;
+ ret = set_block_group_ro(cache);
+out:
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
{
- __alloc_chunk_for_shrink(root, group, 1);
- set_block_group_readonly(group);
+ struct btrfs_space_info *sinfo = cache->space_info;
+ u64 num_bytes;
+
+ BUG_ON(!cache->ro);
+
+ spin_lock(&sinfo->lock);
+ spin_lock(&cache->lock);
+ num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+ cache->bytes_super - btrfs_block_group_used(&cache->item);
+ sinfo->bytes_readonly -= num_bytes;
+ cache->ro = 0;
+ spin_unlock(&cache->lock);
+ spin_unlock(&sinfo->lock);
return 0;
}
@@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
*/
synchronize_rcu();
+ release_global_block_rsv(info);
+
while(!list_empty(&info->space_info)) {
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
-
+ if (space_info->bytes_pinned > 0 ||
+ space_info->bytes_reserved > 0) {
+ WARN_ON(1);
+ dump_space_info(space_info, 0, 0);
+ }
list_del(&space_info->list);
kfree(space_info);
}
return 0;
}
+static void __link_block_group(struct btrfs_space_info *space_info,
+ struct btrfs_block_group_cache *cache)
+{
+ int index = get_block_group_index(cache);
+
+ down_write(&space_info->groups_sem);
+ list_add_tail(&cache->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
+}
+
int btrfs_read_block_groups(struct btrfs_root *root)
{
struct btrfs_path *path;
@@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
while (1) {
ret = find_first_block_group(root, path, &key);
- if (ret > 0) {
- ret = 0;
- goto error;
- }
+ if (ret > 0)
+ break;
if (ret != 0)
goto error;
@@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache) {
ret = -ENOMEM;
- break;
+ goto error;
}
atomic_set(&cache->count, 1);
@@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
BUG_ON(ret);
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&space_info->groups_sem);
- list_add_tail(&cache->list, &space_info->block_groups);
- up_write(&space_info->groups_sem);
+ __link_block_group(space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid))
- set_block_group_readonly(cache);
+ set_block_group_ro(cache);
}
+
+ list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+ if (!(get_alloc_profile(root, space_info->flags) &
+ (BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_DUP)))
+ continue;
+ /*
+ * avoid allocating from un-mirrored block group if there are
+ * mirrored block groups.
+ */
+ list_for_each_entry(cache, &space_info->block_groups[3], list)
+ set_block_group_ro(cache);
+ list_for_each_entry(cache, &space_info->block_groups[4], list)
+ set_block_group_ro(cache);
+ }
+
+ init_global_block_rsv(info);
ret = 0;
error:
btrfs_free_path(path);
@@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
BUG_ON(ret);
spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_super += cache->bytes_super;
+ cache->space_info->bytes_readonly += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
- down_write(&cache->space_info->groups_sem);
- list_add_tail(&cache->list, &cache->space_info->block_groups);
- up_write(&cache->space_info->groups_sem);
+ __link_block_group(cache->space_info, cache);
ret = btrfs_add_block_group_cache(root->fs_info, cache);
BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
return state;
}
-static void free_extent_state(struct extent_state *state)
+void free_extent_state(struct extent_state *state)
{
if (!state)
return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
}
static int set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
return tree->ops->set_bit_hook(tree->mapping->host,
- state->start, state->end,
- state->state, bits);
+ state, bits);
}
return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state,
- unsigned long bits)
+ struct extent_state *state, int *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
- int bits)
+ int *bits)
{
struct rb_node *node;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
int ret;
if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
if (ret)
return ret;
- if (bits & EXTENT_DIRTY)
+ if (bits_to_set & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- state->state |= bits;
+ state->state |= bits_to_set;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
* struct is freed and removed from the tree
*/
static int clear_state_bit(struct extent_io_tree *tree,
- struct extent_state *state, int bits, int wake,
- int delete)
+ struct extent_state *state,
+ int *bits, int wake)
{
- int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+ int bits_to_clear = *bits & ~EXTENT_CTLBITS;
int ret = state->state & bits_to_clear;
- if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+ if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
WARN_ON(range > tree->dirty_bytes);
tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
- if (delete || state->state == 0) {
+ if (state->state == 0) {
if (state->tree) {
- clear_state_cb(tree, state, state->state);
rb_erase(&state->rb_node, &tree->state);
state->tree = NULL;
free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int set = 0;
int clear = 0;
+ if (delete)
+ bits |= ~EXTENT_CTLBITS;
+ bits |= EXTENT_FIRST_DELALLOC;
+
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
@@ -580,8 +581,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits, wake,
- delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+ set |= clear_state_bit(tree, prealloc, &bits, wake);
prealloc = NULL;
goto out;
@@ -613,7 +613,7 @@ hit_next:
else
next_node = NULL;
- set |= clear_state_bit(tree, state, bits, wake, delete);
+ set |= clear_state_bit(tree, state, &bits, wake);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -706,19 +706,19 @@ out:
static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- int bits)
+ int *bits)
{
int ret;
+ int bits_to_set = *bits & ~EXTENT_CTLBITS;
ret = set_state_cb(tree, state, bits);
if (ret)
return ret;
-
- if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
+ if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- state->state |= bits;
+ state->state |= bits_to_set;
return 0;
}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
* [start, end] is inclusive This takes the tree lock.
*/
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive_bits, u64 *failed_start,
- struct extent_state **cached_state,
- gfp_t mask)
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u64 last_start;
u64 last_end;
+ bits |= EXTENT_FIRST_DELALLOC;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
*/
node = tree_search(tree, start);
if (!node) {
- err = insert_state(tree, prealloc, start, end, bits);
+ err = insert_state(tree, prealloc, start, end, &bits);
prealloc = NULL;
BUG_ON(err == -EEXIST);
goto out;
@@ -802,7 +802,7 @@ hit_next:
goto out;
}
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
@@ -852,7 +852,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- err = set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, &bits);
if (err)
goto out;
cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
else
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
- bits);
+ &bits);
BUG_ON(err == -EEXIST);
if (err) {
prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- err = set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, &bits);
if (err) {
prealloc = NULL;
goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, 0, 0,
- NULL, mask);
+ EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
if (op & EXTENT_CLEAR_DELALLOC)
clear_bits |= EXTENT_DELALLOC;
- if (op & EXTENT_CLEAR_ACCOUNTING)
- clear_bits |= EXTENT_DO_ACCOUNTING;
-
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
if (tree->ops && tree->ops->submit_bio_hook)
tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
- mirror_num, bio_flags);
+ mirror_num, bio_flags, start);
else
submit_bio(rw, bio);
if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
+ struct btrfs_ordered_extent *ordered;
int ret;
int nr = 0;
size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_page_extent_mapped(page);
end = page_end;
- lock_extent(tree, start, end, GFP_NOFS);
+ while (1) {
+ lock_extent(tree, start, end, GFP_NOFS);
+ ordered = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered)
+ break;
+ unlock_extent(tree, start, end, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ }
if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
char *userpage;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/* flags for bio submission */
#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags);
+ unsigned long bio_flags, u64 bio_offset);
struct extent_io_ops {
int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
struct extent_state *state);
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
- int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+ int *bits);
int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long bits);
+ int *bits);
int (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
u64 max_bytes, unsigned long bits);
+void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
}
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
- struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+ struct inode *inode, struct bio *bio,
+ u64 logical_offset, u32 *dst, int dio)
{
u32 sum;
struct bio_vec *bvec = bio->bi_io_vec;
int bio_index = 0;
- u64 offset;
+ u64 offset = 0;
u64 item_start_offset = 0;
u64 item_last_offset = 0;
u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
WARN_ON(bio->bi_vcnt <= 0);
disk_bytenr = (u64)bio->bi_sector << 9;
+ if (dio)
+ offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
- offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+ if (!dio)
+ offset = page_offset(bvec->bv_page) + bvec->bv_offset;
ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
if (ret == 0)
goto found;
@@ -238,6 +242,7 @@ found:
else
set_state_private(io_tree, offset, sum);
disk_bytenr += bvec->bv_len;
+ offset += bvec->bv_len;
bio_index++;
bvec++;
}
@@ -245,6 +250,18 @@ found:
return 0;
}
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio, u64 offset, u32 *dst)
+{
+ return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list)
{
@@ -657,6 +674,9 @@ again:
goto found;
}
ret = PTR_ERR(item);
+ if (ret != -EFBIG && ret != -ENOENT)
+ goto fail_unlock;
+
if (ret == -EFBIG) {
u32 item_size;
/* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..79437c5eeb1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
int write_bytes,
struct page **prepared_pages,
- const char __user *buf)
+ struct iov_iter *i)
{
- long page_fault = 0;
- int i;
+ size_t copied;
+ int pg = 0;
int offset = pos & (PAGE_CACHE_SIZE - 1);
- for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+ while (write_bytes > 0) {
size_t count = min_t(size_t,
PAGE_CACHE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[i];
- fault_in_pages_readable(buf, count);
+ struct page *page = prepared_pages[pg];
+again:
+ if (unlikely(iov_iter_fault_in_readable(i, count)))
+ return -EFAULT;
/* Copy data from userspace to the current page */
- kmap(page);
- page_fault = __copy_from_user(page_address(page) + offset,
- buf, count);
+ copied = iov_iter_copy_from_user(page, i, offset, count);
+
/* Flush processor's dcache for this page */
flush_dcache_page(page);
- kunmap(page);
- buf += count;
- write_bytes -= count;
+ iov_iter_advance(i, copied);
+ write_bytes -= copied;
- if (page_fault)
- break;
+ if (unlikely(copied == 0)) {
+ count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ iov_iter_single_seg_count(i));
+ goto again;
+ }
+
+ if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+ offset += copied;
+ } else {
+ pg++;
+ offset = 0;
+ }
}
- return page_fault ? -EFAULT : 0;
+ return 0;
}
/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
end_of_last_block = start_pos + num_bytes - 1;
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
NULL);
- if (err)
- return err;
+ BUG_ON(err);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
- return err;
+ return 0;
}
/*
@@ -823,45 +832,46 @@ again:
return 0;
}
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+ const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- loff_t pos;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = fdentry(file)->d_inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct page *pinned[2];
+ struct page **pages = NULL;
+ struct iov_iter i;
+ loff_t *ppos = &iocb->ki_pos;
loff_t start_pos;
ssize_t num_written = 0;
ssize_t err = 0;
+ size_t count;
+ size_t ocount;
int ret = 0;
- struct inode *inode = fdentry(file)->d_inode;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct page **pages = NULL;
int nrptrs;
- struct page *pinned[2];
unsigned long first_index;
unsigned long last_index;
int will_write;
+ int buffered = 0;
will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
(file->f_flags & O_DIRECT));
- nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
- PAGE_CACHE_SIZE / (sizeof(struct page *)));
pinned[0] = NULL;
pinned[1] = NULL;
- pos = *ppos;
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
- /* do the reserve before the mutex lock in case we have to do some
- * flushing. We wouldn't deadlock, but this is more polite.
- */
- err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (err)
- goto out_nolock;
-
mutex_lock(&inode->i_mutex);
+ err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (err)
+ goto out;
+ count = ocount;
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
goto out;
file_update_time(file);
+ BTRFS_I(inode)->sequence++;
+
+ if (unlikely(file->f_flags & O_DIRECT)) {
+ num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+ pos, ppos, count,
+ ocount);
+ /*
+ * the generic O_DIRECT will update in-memory i_size after the
+ * DIOs are done. But our endio handlers that update the on
+ * disk i_size never update past the in memory i_size. So we
+ * need one more update here to catch any additions to the
+ * file
+ */
+ if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+ btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+ mark_inode_dirty(inode);
+ }
+ if (num_written < 0) {
+ ret = num_written;
+ num_written = 0;
+ goto out;
+ } else if (num_written == count) {
+ /* pick up pos changes done by the generic code */
+ pos = *ppos;
+ goto out;
+ }
+ /*
+ * We are going to do buffered for the rest of the range, so we
+ * need to make sure to invalidate the buffered pages when we're
+ * done.
+ */
+ buffered = 1;
+ pos += num_written;
+ }
+
+ iov_iter_init(&i, iov, nr_segs, count, num_written);
+ nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+ PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+ (sizeof(struct page *)));
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
/* generic_write_checks can change our pos */
start_pos = pos;
- BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
- last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+ last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
/*
* there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
unlock_page(pinned[0]);
}
}
- if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+ if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
pinned[1] = grab_cache_page(inode->i_mapping, last_index);
if (!PageUptodate(pinned[1])) {
ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
}
- while (count > 0) {
+ while (iov_iter_count(&i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
- size_t write_bytes = min(count, nrptrs *
- (size_t)PAGE_CACHE_SIZE -
+ size_t write_bytes = min(iov_iter_count(&i),
+ nrptrs * (size_t)PAGE_CACHE_SIZE -
offset);
size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
WARN_ON(num_pages > nrptrs);
memset(pages, 0, sizeof(struct page *) * nrptrs);
- ret = btrfs_check_data_free_space(root, inode, write_bytes);
+ ret = btrfs_delalloc_reserve_space(inode, write_bytes);
if (ret)
goto out;
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
pos, first_index, last_index,
write_bytes);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
ret = btrfs_copy_from_user(pos, num_pages,
- write_bytes, pages, buf);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
- btrfs_drop_pages(pages, num_pages);
- goto out;
+ write_bytes, pages, &i);
+ if (ret == 0) {
+ dirty_and_release_pages(NULL, root, file, pages,
+ num_pages, pos, write_bytes);
}
- ret = dirty_and_release_pages(NULL, root, file, pages,
- num_pages, pos, write_bytes);
btrfs_drop_pages(pages, num_pages);
if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- write_bytes);
+ btrfs_delalloc_release_space(inode, write_bytes);
goto out;
}
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
btrfs_throttle(root);
}
- buf += write_bytes;
- count -= write_bytes;
pos += write_bytes;
num_written += write_bytes;
@@ -976,9 +1016,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-out_nolock:
kfree(pages);
if (pinned[0])
page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
num_written = err;
if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
btrfs_end_transaction(trans, root);
}
}
- if (file->f_flags & O_DIRECT) {
+ if (file->f_flags & O_DIRECT && buffered) {
invalidate_mapping_pages(inode->i_mapping,
start_pos >> PAGE_CACHE_SHIFT,
(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1104,9 +1142,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (file && file->private_data)
btrfs_ioctl_trans_end(file);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto out;
}
@@ -1161,7 +1199,7 @@ const struct file_operations btrfs_file_operations = {
.read = do_sync_read,
.aio_read = generic_file_aio_read,
.splice_read = generic_file_splice_read,
- .write = btrfs_file_write,
+ .aio_write = btrfs_file_aio_write,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
return 0;
}
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len,
+ u64 inode_objectid, u64 ref_objectid, int mod)
+{
+ struct btrfs_key key;
+ struct btrfs_inode_ref *ref;
+ int ins_len = mod < 0 ? -1 : 0;
+ int cow = mod != 0;
+ int ret;
+
+ key.objectid = inode_objectid;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
+
+ ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return NULL;
+ if (!find_name_in_backref(path, name, name_len, &ref))
+ return NULL;
+ return ref;
+}
+
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
+ btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -414,6 +415,7 @@ again:
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
/* lets try to make an inline extent */
if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
start, end, NULL,
EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
return 0;
}
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+ u64 num_bytes)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ u64 alloc_hint = 0;
+
+ read_lock(&em_tree->lock);
+ em = search_extent_mapping(em_tree, start, num_bytes);
+ if (em) {
+ /*
+ * if block start isn't an actual block number then find the
+ * first block in this inode and use that as a hint. If that
+ * block is also bogus then just don't worry about it.
+ */
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ free_extent_map(em);
+ em = search_extent_mapping(em_tree, 0, 0);
+ if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+ alloc_hint = em->block_start;
+ if (em)
+ free_extent_map(em);
+ } else {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ }
+ read_unlock(&em_tree->lock);
+
+ return alloc_hint;
+}
+
/*
* when extent_io.c finds a delayed allocation range in the file,
* the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
actual_end = min_t(u64, isize, end + 1);
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_CLEAR_UNLOCK_PAGE |
EXTENT_CLEAR_UNLOCK |
EXTENT_CLEAR_DELALLOC |
- EXTENT_CLEAR_ACCOUNTING |
EXTENT_CLEAR_DIRTY |
EXTENT_SET_WRITEBACK |
EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
-
- read_lock(&BTRFS_I(inode)->extent_tree.lock);
- em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
- start, num_bytes);
- if (em) {
- /*
- * if block start isn't an actual block number then find the
- * first block in this inode and use that as a hint. If that
- * block is also bogus then just don't worry about it.
- */
- if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
- if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
- if (em)
- free_extent_map(em);
- } else {
- alloc_hint = em->block_start;
- free_extent_map(em);
- }
- }
- read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+ alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
num_bytes, num_bytes, type);
BUG_ON(ret);
+ if (root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ ret = btrfs_reloc_clone_csums(inode, cur_offset,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
}
static int btrfs_split_extent_hook(struct inode *inode,
- struct extent_state *orig, u64 split)
+ struct extent_state *orig, u64 split)
{
+ /* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
if (!(other->state & EXTENT_DELALLOC))
return 0;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
return 0;
}
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
* bytes in this file, and to maintain the list of inodes that
* have pending delalloc work to be done.
*/
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+ struct extent_state *state, int *bits)
{
/*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else
+ atomic_inc(&BTRFS_I(inode)->outstanding_extents);
spin_lock(&root->fs_info->delalloc_lock);
- BTRFS_I(inode)->delalloc_bytes += end - start + 1;
- root->fs_info->delalloc_bytes += end - start + 1;
+ BTRFS_I(inode)->delalloc_bytes += len;
+ root->fs_info->delalloc_bytes += len;
if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
&root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
static int btrfs_clear_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long bits)
+ struct extent_state *state, int *bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 len = state->end + 1 - state->start;
- if (bits & EXTENT_DO_ACCOUNTING) {
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
- }
+ if (*bits & EXTENT_FIRST_DELALLOC)
+ *bits &= ~EXTENT_FIRST_DELALLOC;
+ else if (!(*bits & EXTENT_DO_ACCOUNTING))
+ atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+ if (*bits & EXTENT_DO_ACCOUNTING)
+ btrfs_delalloc_release_metadata(inode, len);
+
+ if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ btrfs_free_reserved_data_space(inode, len);
spin_lock(&root->fs_info->delalloc_lock);
- if (state->end - state->start + 1 >
- root->fs_info->delalloc_bytes) {
- printk(KERN_INFO "btrfs warning: delalloc account "
- "%llu %llu\n",
- (unsigned long long)
- state->end - state->start + 1,
- (unsigned long long)
- root->fs_info->delalloc_bytes);
- btrfs_delalloc_free_space(root, inode, (u64)-1);
- root->fs_info->delalloc_bytes = 0;
- BTRFS_I(inode)->delalloc_bytes = 0;
- } else {
- btrfs_delalloc_free_space(root, inode,
- state->end -
- state->start + 1);
- root->fs_info->delalloc_bytes -= state->end -
- state->start + 1;
- BTRFS_I(inode)->delalloc_bytes -= state->end -
- state->start + 1;
- }
+ root->fs_info->delalloc_bytes -= len;
+ BTRFS_I(inode)->delalloc_bytes -= len;
+
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
*/
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
* are inserted into the btree
*/
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
* on write, or reading the csums from the tree before a read
*/
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
- int mirror_num, unsigned long bio_flags)
+ int mirror_num, unsigned long bio_flags,
+ u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
/* we're doing a write, do the async checksumming */
return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
inode, rw, bio, mirror_num,
- bio_flags, __btrfs_submit_bio_start,
+ bio_flags, bio_offset,
+ __btrfs_submit_bio_start,
__btrfs_submit_bio_done);
}
@@ -1520,6 +1525,7 @@ again:
goto again;
}
+ BUG();
btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
ClearPageChecked(page);
out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (!ret) {
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
}
goto out;
}
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
0, &cached_state, GFP_NOFS);
trans = btrfs_join_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
add_pending_csums(trans, inode, ordered_extent->file_offset,
&ordered_extent->list);
- /* this also removes the ordered extent from the tree */
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
- btrfs_end_transaction(trans, root);
out:
+ btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+ if (trans)
+ btrfs_end_transaction(trans, root);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
/* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
failrec->last_mirror,
- failrec->bio_flags);
+ failrec->bio_flags, 0);
return 0;
}
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
}
/*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+
+ root = pending->root;
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ block_rsv = root->orphan_block_rsv;
+
+ /* orphan block reservation for the snapshot */
+ num_bytes = block_rsv->size;
+
+ /*
+ * after the snapshot is created, COWing tree blocks may use more
+ * space than it frees. So we should make sure there is enough
+ * reserved space.
+ */
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes += block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ }
+
+ *bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *snap = pending->snap;
+ struct btrfs_block_rsv *block_rsv;
+ u64 num_bytes;
+ int index;
+ int ret;
+
+ if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+ return;
+
+ /* refill source subvolume's orphan block reservation */
+ block_rsv = root->orphan_block_rsv;
+ index = trans->transid & 0x1;
+ if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+ num_bytes = block_rsv->size -
+ (block_rsv->reserved + block_rsv->freed[index]);
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ root->orphan_block_rsv,
+ num_bytes);
+ BUG_ON(ret);
+ }
+
+ /* setup orphan block reservation for the snapshot */
+ block_rsv = btrfs_alloc_block_rsv(snap);
+ BUG_ON(!block_rsv);
+
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+ snap->orphan_block_rsv = block_rsv;
+
+ num_bytes = root->orphan_block_rsv->size;
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ block_rsv, num_bytes);
+ BUG_ON(ret);
+
+#if 0
+ /* insert orphan item for the snapshot */
+ WARN_ON(!root->orphan_item_inserted);
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ snap->root_key.objectid);
+ BUG_ON(ret);
+ snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+ ORPHAN_CLEANUP_STARTED = 1,
+ ORPHAN_CLEANUP_DONE = 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+
+ if (!list_empty(&root->orphan_list) ||
+ root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+ return;
+
+ if (root->orphan_item_inserted &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ root->orphan_item_inserted = 0;
+ }
+
+ if (root->orphan_block_rsv) {
+ WARN_ON(root->orphan_block_rsv->size > 0);
+ btrfs_free_block_rsv(root, root->orphan_block_rsv);
+ root->orphan_block_rsv = NULL;
+ }
+}
+
+/*
* This creates an orphan entry for the given inode in case something goes
* wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ * this function.
*/
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
+ struct btrfs_block_rsv *block_rsv = NULL;
+ int reserve = 0;
+ int insert = 0;
+ int ret;
- spin_lock(&root->list_lock);
+ if (!root->orphan_block_rsv) {
+ block_rsv = btrfs_alloc_block_rsv(root);
+ BUG_ON(!block_rsv);
+ }
- /* already on the orphan list, we're good */
- if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!root->orphan_block_rsv) {
+ root->orphan_block_rsv = block_rsv;
+ } else if (block_rsv) {
+ btrfs_free_block_rsv(root, block_rsv);
+ block_rsv = NULL;
+ }
+
+ if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+ /*
+ * For proper ENOSPC handling, we should do orphan
+ * cleanup when mounting. But this introduces backward
+ * compatibility issue.
+ */
+ if (!xchg(&root->orphan_item_inserted, 1))
+ insert = 2;
+ else
+ insert = 1;
+#endif
+ insert = 1;
+ } else {
+ WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
}
- list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+ if (!BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 1;
+ reserve = 1;
+ }
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (block_rsv)
+ btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
- /*
- * insert an orphan item to track this unlinked/truncated file
- */
- ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ /* grab metadata reservation from transaction handle */
+ if (reserve) {
+ ret = btrfs_orphan_reserve_metadata(trans, inode);
+ BUG_ON(ret);
+ }
- return ret;
+ /* insert an orphan item to track this unlinked/truncated file */
+ if (insert >= 1) {
+ ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
+
+ /* insert an orphan item to track subvolume contains orphan files */
+ if (insert >= 2) {
+ ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
+ }
+ return 0;
}
/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int delete_item = 0;
+ int release_rsv = 0;
int ret = 0;
- spin_lock(&root->list_lock);
-
- if (list_empty(&BTRFS_I(inode)->i_orphan)) {
- spin_unlock(&root->list_lock);
- return 0;
+ spin_lock(&root->orphan_lock);
+ if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+ list_del_init(&BTRFS_I(inode)->i_orphan);
+ delete_item = 1;
}
- list_del_init(&BTRFS_I(inode)->i_orphan);
- if (!trans) {
- spin_unlock(&root->list_lock);
- return 0;
+ if (BTRFS_I(inode)->orphan_meta_reserved) {
+ BTRFS_I(inode)->orphan_meta_reserved = 0;
+ release_rsv = 1;
}
+ spin_unlock(&root->orphan_lock);
- spin_unlock(&root->list_lock);
+ if (trans && delete_item) {
+ ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ BUG_ON(ret);
+ }
- ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+ if (release_rsv)
+ btrfs_orphan_release_metadata(inode);
- return ret;
+ return 0;
}
/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
struct inode *inode;
int ret = 0, nr_unlink = 0, nr_truncate = 0;
- if (!xchg(&root->clean_orphans, 0))
+ if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
return;
path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.type = BTRFS_INODE_ITEM_KEY;
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
- if (IS_ERR(inode))
- break;
+ BUG_ON(IS_ERR(inode));
/*
* add this inode to the orphan list so btrfs_orphan_del does
* the proper thing when we hit it
*/
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
/*
* if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
* do a destroy_inode
*/
if (is_bad_inode(inode)) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
btrfs_orphan_del(trans, inode);
btrfs_end_transaction(trans, root);
iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
}
+ btrfs_free_path(path);
+
+ root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+ if (root->orphan_block_rsv)
+ btrfs_block_rsv_release(root, root->orphan_block_rsv,
+ (u64)-1);
+
+ if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ trans = btrfs_join_transaction(root, 1);
+ btrfs_end_transaction(trans, root);
+ }
if (nr_unlink)
printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
if (nr_truncate)
printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
- btrfs_free_path(path);
}
/*
@@ -2478,29 +2666,201 @@ out:
return ret;
}
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/* helper to check if there is any shared block in the path */
+static int check_path_shared(struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ struct extent_buffer *eb;
+ int level;
+ int ret;
+ u64 refs;
+
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ if (!path->nodes[level])
+ break;
+ eb = path->nodes[level];
+ if (!btrfs_block_can_be_shared(root, eb))
+ continue;
+ ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+ &refs, NULL);
+ if (refs > 1)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+ struct dentry *dentry)
{
- struct btrfs_root *root;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_path *path;
+ struct btrfs_inode_ref *ref;
+ struct btrfs_dir_item *di;
struct inode *inode = dentry->d_inode;
+ u64 index;
+ int check_link = 1;
+ int err = -ENOSPC;
int ret;
- unsigned long nr = 0;
- root = BTRFS_I(dir)->root;
+ trans = btrfs_start_transaction(root, 10);
+ if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+ return trans;
- /*
- * 5 items for unlink inode
- * 1 for orphan
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
+ if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return ERR_PTR(-ENOSPC);
+
+ /* check if there is someone else holds reference */
+ if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
+ return ERR_PTR(-ENOSPC);
+
+ if (atomic_read(&inode->i_count) > 2)
+ return ERR_PTR(-ENOSPC);
+
+ if (xchg(&root->fs_info->enospc_unlink, 1))
+ return ERR_PTR(-ENOSPC);
- trans = btrfs_start_transaction(root, 1);
+ path = btrfs_alloc_path();
+ if (!path) {
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 6);
- return PTR_ERR(trans);
+ btrfs_free_path(path);
+ root->fs_info->enospc_unlink = 0;
+ return trans;
+ }
+
+ path->skip_locking = 1;
+ path->search_commit_root = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(dir)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ ret = btrfs_lookup_inode(trans, root, path,
+ &BTRFS_I(inode)->location, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret == 0) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ check_link = 0;
+ }
+ btrfs_release_path(root, path);
+
+ if (ret == 0 && S_ISREG(inode->i_mode)) {
+ ret = btrfs_lookup_file_extent(trans, root, path,
+ inode->i_ino, (u64)-1, 0);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ BUG_ON(ret == 0);
+ if (check_path_shared(root, path))
+ goto out;
+ btrfs_release_path(root, path);
+ }
+
+ if (!check_link) {
+ err = 0;
+ goto out;
+ }
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ if (di) {
+ if (check_path_shared(root, path))
+ goto out;
+ } else {
+ err = 0;
+ goto out;
}
+ btrfs_release_path(root, path);
+
+ ref = btrfs_lookup_inode_ref(trans, root, path,
+ dentry->d_name.name, dentry->d_name.len,
+ inode->i_ino, dir->i_ino, 0);
+ if (IS_ERR(ref)) {
+ err = PTR_ERR(ref);
+ goto out;
+ }
+ BUG_ON(!ref);
+ if (check_path_shared(root, path))
+ goto out;
+ index = btrfs_inode_ref_index(path->nodes[0], ref);
+ btrfs_release_path(root, path);
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len, 0);
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto out;
+ }
+ BUG_ON(ret == -ENOENT);
+ if (check_path_shared(root, path))
+ goto out;
+
+ err = 0;
+out:
+ btrfs_free_path(path);
+ if (err) {
+ btrfs_end_transaction(trans, root);
+ root->fs_info->enospc_unlink = 0;
+ return ERR_PTR(err);
+ }
+
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+ return trans;
+}
+
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+ BUG_ON(!root->fs_info->enospc_unlink);
+ root->fs_info->enospc_unlink = 0;
+ }
+ btrfs_end_transaction_throttle(trans, root);
+}
+
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_trans_handle *trans;
+ struct inode *inode = dentry->d_inode;
+ int ret;
+ unsigned long nr = 0;
+
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
+ BUG_ON(ret);
- if (inode->i_nlink == 0)
+ if (inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, inode);
+ BUG_ON(ret);
+ }
nr = trans->blocks_used;
-
- btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 6);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
int err = 0;
- int ret;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- ret = btrfs_reserve_metadata_space(root, 5);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- btrfs_unreserve_metadata_space(root, 5);
+ trans = __unlink_start_trans(dir, dentry);
+ if (IS_ERR(trans))
return PTR_ERR(trans);
- }
btrfs_set_trans_block_group(trans, dir);
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
btrfs_i_size_write(inode, 0);
out:
nr = trans->blocks_used;
- ret = btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 5);
+ __unlink_end_trans(trans, root);
btrfs_btree_balance_dirty(root, nr);
- if (ret && !err)
- err = ret;
return err;
}
@@ -3029,6 +3380,7 @@ out:
if (pending_del_nr) {
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
+ BUG_ON(ret);
}
btrfs_free_path(path);
return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
if ((offset & (blocksize - 1)) == 0)
goto out;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret)
- goto out;
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret)
goto out;
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
again:
page = grab_cache_page(mapping, index);
if (!page) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto out;
}
@@ -3132,8 +3479,7 @@ again:
out_unlock:
if (ret)
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
+ struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 mask = root->sectorsize - 1;
u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 hint_byte = 0;
hole_size = last_byte - cur_offset;
- err = btrfs_reserve_metadata_space(root, 2);
- if (err)
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
break;
-
- trans = btrfs_start_transaction(root, 1);
+ }
btrfs_set_trans_block_group(trans, inode);
err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
last_byte - 1, 0);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
}
free_extent_map(em);
+ em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
+ free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
GFP_NOFS);
return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
}
}
- ret = btrfs_reserve_metadata_space(root, 1);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 1);
btrfs_btree_balance_dirty(root, nr);
if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
i_size_write(inode, attr->ia_size);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ BUG_ON(!trans->block_rsv);
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_i_size_write(inode, 0);
while (1) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
- ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+ trans->block_rsv = root->orphan_block_rsv;
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
+ }
+ ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
if (ret != -EAGAIN)
break;
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
btrfs_end_transaction(trans, root);
trans = NULL;
btrfs_btree_balance_dirty(root, nr);
+
}
if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
return 0;
}
-static noinline void init_btrfs_i(struct inode *inode)
-{
- struct btrfs_inode *bi = BTRFS_I(inode);
-
- bi->generation = 0;
- bi->sequence = 0;
- bi->last_trans = 0;
- bi->last_sub_trans = 0;
- bi->logged_trans = 0;
- bi->delalloc_bytes = 0;
- bi->reserved_bytes = 0;
- bi->disk_i_size = 0;
- bi->flags = 0;
- bi->index_cnt = (u64)-1;
- bi->last_unlink_trans = 0;
- bi->ordered_data_close = 0;
- bi->force_compress = 0;
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_tree,
- inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
- inode->i_mapping, GFP_NOFS);
- INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
- INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
- btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
- mutex_init(&BTRFS_I(inode)->log_mutex);
-}
-
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
inode->i_ino = args->ino;
- init_btrfs_i(inode);
BTRFS_I(inode)->root = args->root;
btrfs_set_inode_space_info(args->root, inode);
return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
if (!inode)
return ERR_PTR(-ENOMEM);
- init_btrfs_i(inode);
-
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
struct btrfs_trans_handle *trans;
int ret = 0;
- if (root->fs_info->btree_inode == inode)
+ if (BTRFS_I(inode)->dummy_inode)
return 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
+ int ret;
+
+ if (BTRFS_I(inode)->dummy_inode)
+ return;
trans = btrfs_join_transaction(root, 1);
btrfs_set_trans_block_group(trans, inode);
- btrfs_update_inode(trans, root, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret && ret == -ENOSPC) {
+ /* whoops, lets try again with the full transaction */
+ btrfs_end_transaction(trans, root);
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %ld\n",
+ inode->i_ino, PTR_ERR(trans));
+ }
+ return;
+ }
+ btrfs_set_trans_block_group(trans, inode);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ if (printk_ratelimit()) {
+ printk(KERN_ERR "btrfs: fail to "
+ "dirty inode %lu error %d\n",
+ inode->i_ino, ret);
+ }
+ }
+ }
btrfs_end_transaction(trans, root);
}
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* btrfs_get_inode_index_count has an explanation for the magic
* number
*/
- init_btrfs_i(inode);
BTRFS_I(inode)->index_cnt = 2;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
+ btrfs_btree_balance_dirty(root, nr);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
}
- btrfs_btree_balance_dirty(root, nr);
return err;
}
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct inode *inode = NULL;
- int err;
int drop_inode = 0;
+ int err;
unsigned long nr = 0;
u64 objectid;
u64 index = 0;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (root->objectid != BTRFS_I(inode)->root->objectid)
return -EPERM;
- /*
- * 1 item for inode ref
- * 2 items for dir items
- */
- err = btrfs_reserve_metadata_space(root, 3);
- if (err)
- return err;
-
btrfs_inc_nlink(inode);
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto fail;
+ }
btrfs_set_trans_block_group(trans, dir);
atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
- btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
+
/*
* 2 items for inode and ref
* 2 items for dir items
* 1 for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
-
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_fail;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
out_fail:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-
-out_unlock:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
}
flush_dcache_page(page);
} else if (create && PageUptodate(page)) {
+ WARN_ON(1);
if (!trans) {
kunmap(page);
free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
return em;
}
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+ u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct extent_map *em;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct btrfs_key ins;
+ u64 alloc_hint;
+ int ret;
+
+ btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ return ERR_PTR(-ENOMEM);
+
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ alloc_hint = get_extent_allocation_hint(inode, start, len);
+ ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+ alloc_hint, (u64)-1, &ins, 1);
+ if (ret) {
+ em = ERR_PTR(ret);
+ goto out;
+ }
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em) {
+ em = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ em->start = start;
+ em->orig_start = em->start;
+ em->len = ins.offset;
+
+ em->block_start = ins.objectid;
+ em->block_len = ins.offset;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST)
+ break;
+ btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+ }
+
+ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+ ins.offset, ins.offset, 0);
+ if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ em = ERR_PTR(ret);
+ }
+out:
+ btrfs_end_transaction(trans, root);
+ return em;
+}
+
+/*
+ * returns 1 when the nocow is safe, < 1 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+ struct inode *inode, u64 offset, u64 len)
+{
+ struct btrfs_path *path;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 disk_bytenr;
+ u64 backref_offset;
+ u64 extent_end;
+ u64 num_bytes;
+ int slot;
+ int found_type;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+ offset, 0);
+ if (ret < 0)
+ goto out;
+
+ slot = path->slots[0];
+ if (ret == 1) {
+ if (slot == 0) {
+ /* can't find the item, must cow */
+ ret = 0;
+ goto out;
+ }
+ slot--;
+ }
+ ret = 0;
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != inode->i_ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ /* not our file or wrong item type, must cow */
+ goto out;
+ }
+
+ if (key.offset > offset) {
+ /* Wrong offset, must cow */
+ goto out;
+ }
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ found_type = btrfs_file_extent_type(leaf, fi);
+ if (found_type != BTRFS_FILE_EXTENT_REG &&
+ found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+ /* not a regular extent, must cow */
+ goto out;
+ }
+ disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end < offset + len) {
+ /* extent doesn't include our full range, must cow */
+ goto out;
+ }
+
+ if (btrfs_extent_readonly(root, disk_bytenr))
+ goto out;
+
+ /*
+ * look for other files referencing this extent, if we
+ * find any we must cow
+ */
+ if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+ key.offset - backref_offset, disk_bytenr))
+ goto out;
+
+ /*
+ * adjust disk_bytenr and num_bytes to cover just the bytes
+ * in this extent we are about to write. If there
+ * are any csums in that range we have to cow in order
+ * to keep the csums correct
+ */
+ disk_bytenr += backref_offset;
+ disk_bytenr += offset - key.offset;
+ num_bytes = min(offset + len, extent_end) - offset;
+ if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+ goto out;
+ /*
+ * all of the above have passed, it is safe to overwrite this extent
+ * without cow
+ */
+ ret = 1;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct extent_map *em;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start = iblock << inode->i_blkbits;
+ u64 len = bh_result->b_size;
+ struct btrfs_trans_handle *trans;
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+
+ /*
+ * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+ * io. INLINE is special, and we could probably kludge it in here, but
+ * it's still buffered so for safety lets just fall back to the generic
+ * buffered path.
+ *
+ * For COMPRESSED we _have_ to read the entire extent in so we can
+ * decompress it, so there will be buffering required no matter what we
+ * do, so go ahead and fallback to buffered.
+ *
+ * We return -ENOTBLK because thats what makes DIO go ahead and go back
+ * to buffered IO. Don't blame me, this is the price we pay for using
+ * the generic code.
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ em->block_start == EXTENT_MAP_INLINE) {
+ free_extent_map(em);
+ return -ENOTBLK;
+ }
+
+ /* Just a good old fashioned hole, return */
+ if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ free_extent_map(em);
+ /* DIO will do one hole at a time, so just unlock a sector */
+ unlock_extent(&BTRFS_I(inode)->io_tree, start,
+ start + root->sectorsize - 1, GFP_NOFS);
+ return 0;
+ }
+
+ /*
+ * We don't allocate a new extent in the following cases
+ *
+ * 1) The inode is marked as NODATACOW. In this case we'll just use the
+ * existing extent.
+ * 2) The extent is marked as PREALLOC. We're good to go here and can
+ * just use the extent.
+ *
+ */
+ if (!create) {
+ len = em->len - (start - em->start);
+ goto map;
+ }
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ em->block_start != EXTENT_MAP_HOLE)) {
+ int type;
+ int ret;
+ u64 block_start;
+
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ type = BTRFS_ORDERED_PREALLOC;
+ else
+ type = BTRFS_ORDERED_NOCOW;
+ len = min(len, em->len - (start - em->start));
+ block_start = em->block_start + (start - em->start);
+
+ /*
+ * we're not going to log anything, but we do need
+ * to make sure the current transaction stays open
+ * while we look for nocow cross refs
+ */
+ trans = btrfs_join_transaction(root, 0);
+ if (!trans)
+ goto must_cow;
+
+ if (can_nocow_odirect(trans, inode, start, len) == 1) {
+ ret = btrfs_add_ordered_extent_dio(inode, start,
+ block_start, len, len, type);
+ btrfs_end_transaction(trans, root);
+ if (ret) {
+ free_extent_map(em);
+ return ret;
+ }
+ goto unlock;
+ }
+ btrfs_end_transaction(trans, root);
+ }
+must_cow:
+ /*
+ * this will cow the extent, reset the len in case we changed
+ * it above
+ */
+ len = bh_result->b_size;
+ free_extent_map(em);
+ em = btrfs_new_extent_direct(inode, start, len);
+ if (IS_ERR(em))
+ return PTR_ERR(em);
+ len = min(len, em->len - (start - em->start));
+unlock:
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+ 0, NULL, GFP_NOFS);
+map:
+ bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+ inode->i_blkbits;
+ bh_result->b_size = len;
+ bh_result->b_bdev = em->bdev;
+ set_buffer_mapped(bh_result);
+ if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ set_buffer_new(bh_result);
+
+ free_extent_map(em);
+
+ return 0;
+}
+
+struct btrfs_dio_private {
+ struct inode *inode;
+ u64 logical_offset;
+ u64 disk_bytenr;
+ u64 bytes;
+ u32 *csums;
+ void *private;
+};
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+ struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 start;
+ u32 *private = dip->csums;
+
+ start = dip->logical_offset;
+ do {
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ struct page *page = bvec->bv_page;
+ char *kaddr;
+ u32 csum = ~(u32)0;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kaddr = kmap_atomic(page, KM_IRQ0);
+ csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+ csum, bvec->bv_len);
+ btrfs_csum_final(csum, (char *)&csum);
+ kunmap_atomic(kaddr, KM_IRQ0);
+ local_irq_restore(flags);
+
+ flush_dcache_page(bvec->bv_page);
+ if (csum != *private) {
+ printk(KERN_ERR "btrfs csum failed ino %lu off"
+ " %llu csum %u private %u\n",
+ inode->i_ino, (unsigned long long)start,
+ csum, *private);
+ err = -EIO;
+ }
+ }
+
+ start += bvec->bv_len;
+ private++;
+ bvec++;
+ } while (bvec <= bvec_end);
+
+ unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+ dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
+
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+ struct btrfs_dio_private *dip = bio->bi_private;
+ struct inode *inode = dip->inode;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_ordered_extent *ordered = NULL;
+ struct extent_state *cached_state = NULL;
+ int ret;
+
+ if (err)
+ goto out_done;
+
+ ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+ dip->logical_offset, dip->bytes);
+ if (!ret)
+ goto out_done;
+
+ BUG_ON(!ordered);
+
+ trans = btrfs_join_transaction(root, 1);
+ if (!trans) {
+ err = -ENOMEM;
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+ ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+ if (!ret)
+ ret = btrfs_update_inode(trans, root, inode);
+ err = ret;
+ goto out;
+ }
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1, 0,
+ &cached_state, GFP_NOFS);
+
+ if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+ ret = btrfs_mark_extent_written(trans, inode,
+ ordered->file_offset,
+ ordered->file_offset +
+ ordered->len);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
+ } else {
+ ret = insert_reserved_file_extent(trans, inode,
+ ordered->file_offset,
+ ordered->start,
+ ordered->disk_len,
+ ordered->len,
+ ordered->len,
+ 0, 0, 0,
+ BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered->file_offset, ordered->len);
+ if (ret) {
+ err = ret;
+ WARN_ON(1);
+ goto out_unlock;
+ }
+ }
+
+ add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+ btrfs_ordered_update_i_size(inode, 0, ordered);
+ btrfs_update_inode(trans, root, inode);
+out_unlock:
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+ ordered->file_offset + ordered->len - 1,
+ &cached_state, GFP_NOFS);
+out:
+ btrfs_delalloc_release_metadata(inode, ordered->len);
+ btrfs_end_transaction(trans, root);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+out_done:
+ bio->bi_private = dip->private;
+
+ kfree(dip->csums);
+ kfree(dip);
+ dio_end_io(bio, err);
+}
+
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+ struct bio *bio, int mirror_num,
+ unsigned long bio_flags, u64 offset)
+{
+ int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
+ BUG_ON(ret);
+ return 0;
+}
+
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+ loff_t file_offset)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_private *dip;
+ struct bio_vec *bvec = bio->bi_io_vec;
+ u64 start;
+ int skip_sum;
+ int write = rw & (1 << BIO_RW);
+ int ret = 0;
+
+ skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+ dip = kmalloc(sizeof(*dip), GFP_NOFS);
+ if (!dip) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ dip->csums = NULL;
+
+ if (!skip_sum) {
+ dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+ if (!dip->csums) {
+ ret = -ENOMEM;
+ goto free_ordered;
+ }
+ }
+
+ dip->private = bio->bi_private;
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+
+ start = dip->logical_offset;
+ dip->bytes = 0;
+ do {
+ dip->bytes += bvec->bv_len;
+ bvec++;
+ } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+
+ dip->disk_bytenr = (u64)bio->bi_sector << 9;
+ bio->bi_private = dip;
+
+ if (write)
+ bio->bi_end_io = btrfs_endio_direct_write;
+ else
+ bio->bi_end_io = btrfs_endio_direct_read;
+
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ if (ret)
+ goto out_err;
+
+ if (write && !skip_sum) {
+ ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, 0, 0,
+ dip->logical_offset,
+ __btrfs_submit_bio_start_direct_io,
+ __btrfs_submit_bio_done);
+ if (ret)
+ goto out_err;
+ return;
+ } else if (!skip_sum)
+ btrfs_lookup_bio_sums_dio(root, inode, bio,
+ dip->logical_offset, dip->csums);
+
+ ret = btrfs_map_bio(root, rw, bio, 0, 1);
+ if (ret)
+ goto out_err;
+ return;
+out_err:
+ kfree(dip->csums);
+ kfree(dip);
+free_ordered:
+ /*
+ * If this is a write, we need to clean up the reserved space and kill
+ * the ordered extent.
+ */
+ if (write) {
+ struct btrfs_ordered_extent *ordered;
+ ordered = btrfs_lookup_ordered_extent(inode,
+ dip->logical_offset);
+ if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+ btrfs_free_reserved_extent(root, ordered->start,
+ ordered->disk_len);
+ btrfs_put_ordered_extent(ordered);
+ btrfs_put_ordered_extent(ordered);
+ }
+ bio_endio(bio, ret);
+}
+
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+{
+ int seg;
+ size_t size;
+ unsigned long addr;
+ unsigned blocksize_mask = root->sectorsize - 1;
+ ssize_t retval = -EINVAL;
+ loff_t end = offset;
+
+ if (offset & blocksize_mask)
+ goto out;
+
+ /* Check the memory alignment. Blocks cannot straddle pages */
+ for (seg = 0; seg < nr_segs; seg++) {
+ addr = (unsigned long)iov[seg].iov_base;
+ size = iov[seg].iov_len;
+ end += size;
+ if ((addr & blocksize_mask) || (size & blocksize_mask))
+ goto out;
+ }
+ retval = 0;
+out:
+ return retval;
+}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset,
unsigned long nr_segs)
{
- return -EINVAL;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_state *cached_state = NULL;
+ u64 lockstart, lockend;
+ ssize_t ret;
+ int writing = rw & WRITE;
+ int write_bits = 0;
+ size_t count = iov_length(iov, nr_segs);
+
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+ offset, nr_segs)) {
+ return 0;
+ }
+
+ lockstart = offset;
+ lockend = offset + count - 1;
+
+ if (writing) {
+ ret = btrfs_delalloc_reserve_space(inode, count);
+ if (ret)
+ goto out;
+ }
+
+ while (1) {
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ 0, &cached_state, GFP_NOFS);
+ /*
+ * We're concerned with the entire range that we're going to be
+ * doing DIO to, so we need to make sure theres no ordered
+ * extents in this range.
+ */
+ ordered = btrfs_lookup_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (!ordered)
+ break;
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ cond_resched();
+ }
+
+ /*
+ * we don't use btrfs_set_extent_delalloc because we don't want
+ * the dirty or uptodate bits
+ */
+ if (writing) {
+ write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+ ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_DELALLOC, 0, NULL, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, EXTENT_LOCKED | write_bits,
+ 1, 0, &cached_state, GFP_NOFS);
+ goto out;
+ }
+ }
+
+ free_extent_state(cached_state);
+ cached_state = NULL;
+
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+ iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ btrfs_submit_direct, 0);
+
+ if (ret < 0 && ret != -EIOCBQUEUED) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+ /*
+ * We're falling back to buffered, unlock the section we didn't
+ * do IO on.
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+ offset + iov_length(iov, nr_segs) - 1,
+ EXTENT_LOCKED | write_bits, 1, 0,
+ &cached_state, GFP_NOFS);
+ }
+out:
+ free_extent_state(cached_state);
+ return ret;
}
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_start;
u64 page_end;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
if (ret) {
if (ret == -ENOMEM)
ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -5059,7 +6021,6 @@ again:
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
/* page got truncated out from underneath us */
goto out_unlock;
}
@@ -5100,7 +6061,6 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state, GFP_NOFS);
ret = VM_FAULT_SIGBUS;
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
goto out_unlock;
}
ret = 0;
@@ -5127,10 +6087,10 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
out_unlock:
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
out:
return ret;
}
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
/*
* setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
btrfs_add_ordered_operation(trans, root, inode);
while (1) {
+ if (!trans) {
+ trans = btrfs_start_transaction(root, 0);
+ BUG_ON(IS_ERR(trans));
+ btrfs_set_trans_block_group(trans, inode);
+ trans->block_rsv = root->orphan_block_rsv;
+ }
+
+ ret = btrfs_block_rsv_check(trans, root,
+ root->orphan_block_rsv, 0, 5);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ trans = NULL;
+ continue;
+ }
+
ret = btrfs_truncate_inode_items(trans, root, inode,
inode->i_size,
BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
+ trans = NULL;
btrfs_btree_balance_dirty(root, nr);
-
- trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, inode);
}
if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
struct btrfs_inode *ei;
+ struct inode *inode;
ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+
+ ei->root = NULL;
+ ei->space_info = NULL;
+ ei->generation = 0;
+ ei->sequence = 0;
ei->last_trans = 0;
ei->last_sub_trans = 0;
ei->logged_trans = 0;
- ei->outstanding_extents = 0;
- ei->reserved_extents = 0;
- ei->root = NULL;
+ ei->delalloc_bytes = 0;
+ ei->reserved_bytes = 0;
+ ei->disk_i_size = 0;
+ ei->flags = 0;
+ ei->index_cnt = (u64)-1;
+ ei->last_unlink_trans = 0;
+
spin_lock_init(&ei->accounting_lock);
+ atomic_set(&ei->outstanding_extents, 0);
+ ei->reserved_extents = 0;
+
+ ei->ordered_data_close = 0;
+ ei->orphan_meta_reserved = 0;
+ ei->dummy_inode = 0;
+ ei->force_compress = 0;
+
+ inode = &ei->vfs_inode;
+ extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+ extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+ extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+ mutex_init(&ei->log_mutex);
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->ordered_operations);
- return &ei->vfs_inode;
+ RB_CLEAR_NODE(&ei->rb_node);
+
+ return inode;
}
void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
+ WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+ WARN_ON(BTRFS_I(inode)->reserved_extents);
/*
* This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
spin_unlock(&root->fs_info->ordered_extent_lock);
}
- spin_lock(&root->list_lock);
+ spin_lock(&root->orphan_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
inode->i_ino);
list_del_init(&BTRFS_I(inode)->i_orphan);
}
- spin_unlock(&root->list_lock);
+ spin_unlock(&root->orphan_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
-
- /*
- * We want to reserve the absolute worst case amount of items. So if
- * both inodes are subvols and we need to unlink them then that would
- * require 4 item modifications, but if they are both normal inodes it
- * would require 5 item modifications, so we'll assume their normal
- * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
- * should cover the worst case number of items we'll modify.
- */
- ret = btrfs_reserve_metadata_space(root, 11);
- if (ret)
- return ret;
-
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* close the racy window with snapshot create/destroy ioctl */
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
+ /*
+ * We want to reserve the absolute worst case amount of items. So if
+ * both inodes are subvols and we need to unlink them then that would
+ * require 4 item modifications, but if they are both normal inodes it
+ * would require 5 item modifications, so we'll assume their normal
+ * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+ * should cover the worst case number of items we'll modify.
+ */
+ trans = btrfs_start_transaction(root, 20);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, new_dir);
if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
- btrfs_unreserve_metadata_space(root, 11);
return ret;
}
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return 0;
}
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{
+ struct btrfs_inode *binode;
+ struct inode *inode = NULL;
+
+ spin_lock(&root->fs_info->delalloc_lock);
+ while (!list_empty(&root->fs_info->delalloc_inodes)) {
+ binode = list_entry(root->fs_info->delalloc_inodes.next,
+ struct btrfs_inode, delalloc_inodes);
+ inode = igrab(&binode->vfs_inode);
+ if (inode) {
+ list_move_tail(&binode->delalloc_inodes,
+ &root->fs_info->delalloc_inodes);
+ break;
+ }
+
+ list_del_init(&binode->delalloc_inodes);
+ cond_resched_lock(&root->fs_info->delalloc_lock);
+ }
+ spin_unlock(&root->fs_info->delalloc_lock);
+
+ if (inode) {
+ write_inode_now(inode, 0);
+ if (delay_iput)
+ btrfs_add_delayed_iput(inode);
+ else
+ iput(inode);
+ return 1;
+ }
+ return 0;
+}
+
static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
const char *symname)
{
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
+ err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+ if (err)
+ return err;
/*
* 2 items for inode item and ref
* 2 items for dir items
* 1 item for xattr if selinux is on
*/
- err = btrfs_reserve_metadata_space(root, 5);
- if (err)
- return err;
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- goto out_fail;
btrfs_set_trans_block_group(trans, dir);
- err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
- if (err) {
- err = -ENOSPC;
- goto out_unlock;
- }
-
inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
dentry->d_name.len,
dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
-out_fail:
- btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
return err;
}
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
- u64 alloc_hint, int mode, loff_t actual_len)
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+ u64 start, u64 num_bytes, u64 min_size,
+ loff_t actual_len, u64 *alloc_hint)
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
- u64 num_bytes = end - start;
int ret = 0;
- u64 i_size;
while (num_bytes > 0) {
- trans = btrfs_start_transaction(root, 1);
-
- ret = btrfs_reserve_extent(trans, root, num_bytes,
- root->sectorsize, 0, alloc_hint,
- (u64)-1, &ins, 1);
- if (ret) {
- WARN_ON(1);
- goto stop_trans;
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
}
- ret = btrfs_reserve_metadata_space(root, 3);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+ 0, *alloc_hint, (u64)-1, &ins, 1);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid,
- ins.offset);
- goto stop_trans;
+ btrfs_end_transaction(trans, root);
+ break;
}
ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
num_bytes -= ins.offset;
cur_offset += ins.offset;
- alloc_hint = ins.objectid + ins.offset;
+ *alloc_hint = ins.objectid + ins.offset;
inode->i_ctime = CURRENT_TIME;
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (actual_len > inode->i_size) &&
- (cur_offset > inode->i_size)) {
-
+ (actual_len > inode->i_size) &&
+ (cur_offset > inode->i_size)) {
if (cur_offset > actual_len)
- i_size = actual_len;
+ i_size_write(inode, actual_len);
else
- i_size = cur_offset;
- i_size_write(inode, i_size);
- btrfs_ordered_update_i_size(inode, i_size, NULL);
+ i_size_write(inode, cur_offset);
+ i_size_write(inode, cur_offset);
+ btrfs_ordered_update_i_size(inode, cur_offset, NULL);
}
ret = btrfs_update_inode(trans, root, inode);
BUG_ON(ret);
btrfs_end_transaction(trans, root);
- btrfs_unreserve_metadata_space(root, 3);
}
return ret;
-
-stop_trans:
- btrfs_end_transaction(trans, root);
- return ret;
-
}
static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
- ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
if (ret)
goto out;
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = prealloc_file_range(inode,
- cur_offset, last_byte,
- alloc_hint, mode, offset+len);
+ ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
+ last_byte - cur_offset,
+ 1 << inode->i_blkbits,
+ offset + len,
+ &alloc_hint);
if (ret < 0) {
free_extent_map(em);
break;
}
}
- if (em->block_start <= EXTENT_MAP_LAST_BYTE)
- alloc_hint = em->block_start;
free_extent_map(em);
cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
- btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
- alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
+ ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+ 0, &objectid);
+ if (ret)
+ return ret;
/*
* 1 - inode item
* 2 - refs
* 1 - root item
* 2 - dir items
*/
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- return ret;
-
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
- 0, &objectid);
- if (ret)
- goto fail;
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-
- btrfs_unreserve_metadata_space(root, 6);
return ret;
}
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
- char *name, int namelen)
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
{
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- /*
- * 1 - inode item
- * 2 - refs
- * 1 - root item
- * 2 - dir items
- */
- ret = btrfs_reserve_metadata_space(root, 6);
- if (ret)
- goto fail;
-
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot) {
- ret = -ENOMEM;
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
- if (!pending_snapshot->name) {
- ret = -ENOMEM;
- kfree(pending_snapshot);
- btrfs_unreserve_metadata_space(root, 6);
- goto fail;
- }
- memcpy(pending_snapshot->name, name, namelen);
- pending_snapshot->name[namelen] = '\0';
+ if (!pending_snapshot)
+ return -ENOMEM;
+
+ btrfs_init_block_rsv(&pending_snapshot->block_rsv);
pending_snapshot->dentry = dentry;
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
pending_snapshot->root = root;
+
+ trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+
+ ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+ BUG_ON(ret);
+
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
- ret = btrfs_commit_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
BUG_ON(ret);
- btrfs_unreserve_metadata_space(root, 6);
+
+ ret = pending_snapshot->error;
+ if (ret)
+ goto fail;
+
+ btrfs_orphan_cleanup(pending_snapshot->snap);
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
d_instantiate(dentry, inode);
ret = 0;
fail:
+ kfree(pending_snapshot);
return ret;
}
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
goto out_up_read;
if (snap_src) {
- error = create_snapshot(snap_src, dentry,
- name, namelen);
+ error = create_snapshot(snap_src, dentry);
} else {
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
BTRFS_I(inode)->force_compress = 1;
- ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
- if (ret) {
- ret = -ENOSPC;
- break;
- }
-
- ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
- if (ret) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
- ret = -ENOSPC;
- break;
- }
+ ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto err_unlock;
again:
if (inode->i_size == 0 ||
i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
}
page = grab_cache_page(inode->i_mapping, i);
- if (!page)
+ if (!page) {
+ ret = -ENOMEM;
goto err_reservations;
+ }
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ ret = -EIO;
goto err_reservations;
}
}
@@ -644,8 +623,7 @@ again:
wait_on_page_writeback(page);
if (PageDirty(page)) {
- btrfs_free_reserved_data_space(root, inode,
- PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
goto loop_unlock;
}
@@ -683,7 +661,6 @@ loop_unlock:
page_cache_release(page);
mutex_unlock(&inode->i_mutex);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
i++;
}
@@ -713,9 +690,9 @@ loop_unlock:
return 0;
err_reservations:
+ btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
mutex_unlock(&inode->i_mutex);
- btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
- btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
return ret;
}
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
device->name, (unsigned long long)new_size);
if (new_size > old_size) {
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans, root);
} else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out_up_write;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ goto out;
+ }
+ trans->block_rsv = &root->fs_info->global_block_rsv;
+
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- ret = btrfs_insert_orphan_item(trans,
- root->fs_info->tree_root,
- dest->root_key.objectid);
- BUG_ON(ret);
+ if (!xchg(&dest->orphan_item_inserted, 1)) {
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+ }
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EPERM;
goto out;
}
- btrfs_defrag_root(root, 0);
- btrfs_defrag_root(root->fs_info->extent_root, 0);
+ ret = btrfs_defrag_root(root, 0);
+ if (ret)
+ goto out;
+ ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
break;
case S_IFREG:
if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
- btrfs_defrag_file(file, range);
+ ret = btrfs_defrag_file(file, range);
kfree(range);
break;
+ default:
+ ret = -EINVAL;
}
out:
mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
btrfs_wait_ordered_range(src, off, off+len);
}
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
-
- /* punch hole in destination first */
- btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-
/* clone data */
key.objectid = src->i_ino;
key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* note the key will change type as we walk through the
* tree.
*/
- ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
+ if (off > key.offset) {
+ datao += off - key.offset;
+ datal -= off - key.offset;
+ }
+
+ if (key.offset + datal > off + len)
+ datal = off + len - key.offset;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
leaf = path->nodes[0];
slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
- if (off > key.offset) {
- datao += off - key.offset;
- datal -= off - key.offset;
- }
-
- if (key.offset + datal > off + len)
- datal = off + len - key.offset;
-
/* disko == 0 means it's a hole */
if (!disko)
datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
if (comp && (skip || trim)) {
ret = -EINVAL;
+ btrfs_end_transaction(trans, root);
goto out;
}
size -= skip + trim;
datal -= skip + trim;
+
+ ret = btrfs_drop_extents(trans, inode,
+ new_key.offset,
+ new_key.offset + datal,
+ &hint_byte, 1);
+ BUG_ON(ret);
+
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
- if (ret)
- goto out;
+ BUG_ON(ret);
if (skip) {
u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
}
btrfs_mark_buffer_dirty(leaf);
- }
+ btrfs_release_path(root, path);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ if (new_key.offset + datal > inode->i_size)
+ btrfs_i_size_write(inode,
+ new_key.offset + datal);
+ BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+ ret = btrfs_update_inode(trans, root, inode);
+ BUG_ON(ret);
+ btrfs_end_transaction(trans, root);
+ }
next:
btrfs_release_path(root, path);
key.offset++;
@@ -1717,17 +1727,7 @@ next:
ret = 0;
out:
btrfs_release_path(root, path);
- if (ret == 0) {
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- if (destoff + olen > inode->i_size)
- btrfs_i_size_write(inode, destoff + olen);
- BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
- ret = btrfs_update_inode(trans, root, inode);
- }
- btrfs_end_transaction(trans, root);
unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
- if (ret)
- vmtruncate(inode, 0);
out_unlock:
mutex_unlock(&src->i_mutex);
mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
return 1;
}
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+ u64 len)
+{
+ if (file_offset + len <= entry->file_offset ||
+ entry->file_offset + entry->len <= file_offset)
+ return 0;
+ return 1;
+}
+
/*
* look find the first ordered struct that has this offset, otherwise
* the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int type)
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len,
+ int type, int dio)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
+ if (dio)
+ set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
+
/* one ref for the tree */
atomic_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
return 0;
}
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0);
+}
+
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type)
+{
+ return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 1);
+}
+
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
- spin_lock(&BTRFS_I(inode)->accounting_lock);
- WARN_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
- spin_unlock(&BTRFS_I(inode)->accounting_lock);
- btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
- inode, 1);
-
spin_lock(&root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -588,6 +609,47 @@ out:
return entry;
}
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len)
+{
+ struct btrfs_ordered_inode_tree *tree;
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry = NULL;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock(&tree->lock);
+ node = tree_search(tree, file_offset);
+ if (!node) {
+ node = tree_search(tree, file_offset + len);
+ if (!node)
+ goto out;
+ }
+
+ while (1) {
+ entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+ if (range_overlaps(entry, file_offset, len))
+ break;
+
+ if (entry->file_offset >= file_offset + len) {
+ entry = NULL;
+ break;
+ }
+ entry = NULL;
+ node = rb_next(node);
+ if (!node)
+ break;
+ }
+out:
+ if (entry)
+ atomic_inc(&entry->refs);
+ spin_unlock(&tree->lock);
+ return entry;
+}
+
/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
- u64 start, u64 len, u64 disk_len, int tyep);
+ u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+ u64 start, u64 len, u64 disk_len, int type);
int btrfs_add_ordered_sum(struct inode *inode,
struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+ u64 file_offset,
+ u64 len);
int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
struct backref_node {
struct rb_node rb_node;
u64 bytenr;
- /* objectid tree block owner */
+
+ u64 new_bytenr;
+ /* objectid of tree block owner, can be not uptodate */
u64 owner;
+ /* link to pending, changed or detached list */
+ struct list_head list;
/* list of upper level blocks reference this block */
struct list_head upper;
/* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
struct extent_buffer *eb;
/* level of tree block */
unsigned int level:8;
- /* 1 if the block is root of old snapshot */
- unsigned int old_root:1;
- /* 1 if no child blocks in the cache */
+ /* is the block in non-reference counted tree */
+ unsigned int cowonly:1;
+ /* 1 if no child node in the cache */
unsigned int lowest:1;
/* is the extent buffer locked */
unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
unsigned int processed:1;
/* have backrefs of this block been checked */
unsigned int checked:1;
+ /*
+ * 1 if corresponding block has been cowed but some upper
+ * level block pointers may not point to the new location
+ */
+ unsigned int pending:1;
+ /*
+ * 1 if the backref node isn't connected to any other
+ * backref node.
+ */
+ unsigned int detached:1;
};
/*
@@ -74,7 +88,6 @@ struct backref_node {
struct backref_edge {
struct list_head list[2];
struct backref_node *node[2];
- u64 blockptr;
};
#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
struct backref_cache {
/* red black tree of all backref nodes in the cache */
struct rb_root rb_root;
- /* list of backref nodes with no child block in the cache */
+ /* for passing backref nodes to btrfs_reloc_cow_block */
+ struct backref_node *path[BTRFS_MAX_LEVEL];
+ /*
+ * list of blocks that have been cowed but some block
+ * pointers in upper level blocks may not reflect the
+ * new location
+ */
struct list_head pending[BTRFS_MAX_LEVEL];
- spinlock_t lock;
+ /* list of backref nodes with no child node */
+ struct list_head leaves;
+ /* list of blocks that have been cowed in current transaction */
+ struct list_head changed;
+ /* list of detached backref node. */
+ struct list_head detached;
+
+ u64 last_trans;
+
+ int nr_nodes;
+ int nr_edges;
};
/*
@@ -113,15 +142,6 @@ struct tree_block {
unsigned int key_ready:1;
};
-/* inode vector */
-#define INODEVEC_SIZE 16
-
-struct inodevec {
- struct list_head list;
- struct inode *inode[INODEVEC_SIZE];
- int nr;
-};
-
#define MAX_EXTENTS 128
struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
struct btrfs_root *extent_root;
/* inode for moving data */
struct inode *data_inode;
- struct btrfs_workers workers;
+
+ struct btrfs_block_rsv *block_rsv;
+
+ struct backref_cache backref_cache;
+
+ struct file_extent_cluster cluster;
/* tree blocks have been processed */
struct extent_io_tree processed_blocks;
/* map start of tree root to corresponding reloc tree */
struct mapping_tree reloc_root_tree;
/* list of reloc trees */
struct list_head reloc_roots;
+ /* size of metadata reservation for merging reloc trees */
+ u64 merging_rsv_size;
+ /* size of relocated tree nodes */
+ u64 nodes_relocated;
+
u64 search_start;
u64 extents_found;
- u64 extents_skipped;
- int stage;
- int create_reloc_root;
+
+ int block_rsv_retries;
+
+ unsigned int stage:8;
+ unsigned int create_reloc_tree:1;
+ unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
- unsigned int found_old_snapshot:1;
+ unsigned int commit_transaction:1;
};
/* stages of data relocation */
#define MOVE_DATA_EXTENTS 0
#define UPDATE_DATA_PTRS 1
-/*
- * merge reloc tree to corresponding fs tree in worker threads
- */
-struct async_merge {
- struct btrfs_work work;
- struct reloc_control *rc;
- struct btrfs_root *root;
- struct completion *done;
- atomic_t *num_pending;
-};
+static void remove_backref_node(struct backref_cache *cache,
+ struct backref_node *node);
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node);
static void mapping_tree_init(struct mapping_tree *tree)
{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
- spin_lock_init(&cache->lock);
+ INIT_LIST_HEAD(&cache->changed);
+ INIT_LIST_HEAD(&cache->detached);
+ INIT_LIST_HEAD(&cache->leaves);
+}
+
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int i;
+
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->leaves)) {
+ node = list_entry(cache->leaves.next,
+ struct backref_node, lower);
+ remove_backref_node(cache, node);
+ }
+
+ cache->last_trans = 0;
+
+ for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+ BUG_ON(!list_empty(&cache->pending[i]));
+ BUG_ON(!list_empty(&cache->changed));
+ BUG_ON(!list_empty(&cache->detached));
+ BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+ BUG_ON(cache->nr_nodes);
+ BUG_ON(cache->nr_edges);
+}
+
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+ struct backref_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_NOFS);
+ if (node) {
+ INIT_LIST_HEAD(&node->list);
+ INIT_LIST_HEAD(&node->upper);
+ INIT_LIST_HEAD(&node->lower);
+ RB_CLEAR_NODE(&node->rb_node);
+ cache->nr_nodes++;
+ }
+ return node;
+}
+
+static void free_backref_node(struct backref_cache *cache,
+ struct backref_node *node)
+{
+ if (node) {
+ cache->nr_nodes--;
+ kfree(node);
+ }
+}
+
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+ struct backref_edge *edge;
+
+ edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ if (edge)
+ cache->nr_edges++;
+ return edge;
}
-static void backref_node_init(struct backref_node *node)
+static void free_backref_edge(struct backref_cache *cache,
+ struct backref_edge *edge)
{
- memset(node, 0, sizeof(*node));
- INIT_LIST_HEAD(&node->upper);
- INIT_LIST_HEAD(&node->lower);
- RB_CLEAR_NODE(&node->rb_node);
+ if (edge) {
+ cache->nr_edges--;
+ kfree(edge);
+ }
}
static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
edges[idx++] = edge;
node = edge->node[UPPER];
}
+ BUG_ON(node->detached);
*index = idx;
return node;
}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
return NULL;
}
+static void unlock_node_buffer(struct backref_node *node)
+{
+ if (node->locked) {
+ btrfs_tree_unlock(node->eb);
+ node->locked = 0;
+ }
+}
+
static void drop_node_buffer(struct backref_node *node)
{
if (node->eb) {
- if (node->locked) {
- btrfs_tree_unlock(node->eb);
- node->locked = 0;
- }
+ unlock_node_buffer(node);
free_extent_buffer(node->eb);
node->eb = NULL;
}
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
static void drop_backref_node(struct backref_cache *tree,
struct backref_node *node)
{
- BUG_ON(!node->lowest);
BUG_ON(!list_empty(&node->upper));
drop_node_buffer(node);
+ list_del(&node->list);
list_del(&node->lower);
-
- rb_erase(&node->rb_node, &tree->rb_root);
- kfree(node);
+ if (!RB_EMPTY_NODE(&node->rb_node))
+ rb_erase(&node->rb_node, &tree->rb_root);
+ free_backref_node(tree, node);
}
/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
if (!node)
return;
- BUG_ON(!node->lowest);
+ BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next, struct backref_edge,
list[LOWER]);
upper = edge->node[UPPER];
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
- kfree(edge);
+ free_backref_edge(cache, edge);
+
+ if (RB_EMPTY_NODE(&upper->rb_node)) {
+ BUG_ON(!list_empty(&node->upper));
+ drop_backref_node(cache, node);
+ node = upper;
+ node->lowest = 1;
+ continue;
+ }
/*
- * add the node to pending list if no other
+ * add the node to leaf node list if no other
* child block cached.
*/
if (list_empty(&upper->lower)) {
- list_add_tail(&upper->lower,
- &cache->pending[upper->level]);
+ list_add_tail(&upper->lower, &cache->leaves);
upper->lowest = 1;
}
}
+
drop_backref_node(cache, node);
}
+static void update_backref_node(struct backref_cache *cache,
+ struct backref_node *node, u64 bytenr)
+{
+ struct rb_node *rb_node;
+ rb_erase(&node->rb_node, &cache->rb_root);
+ node->bytenr = bytenr;
+ rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+ BUG_ON(rb_node);
+}
+
+/*
+ * update backref cache after a transaction commit
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+ struct backref_cache *cache)
+{
+ struct backref_node *node;
+ int level = 0;
+
+ if (cache->last_trans == 0) {
+ cache->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (cache->last_trans == trans->transid)
+ return 0;
+
+ /*
+ * detached nodes are used to avoid unnecessary backref
+ * lookup. transaction commit changes the extent tree.
+ * so the detached nodes are no longer useful.
+ */
+ while (!list_empty(&cache->detached)) {
+ node = list_entry(cache->detached.next,
+ struct backref_node, list);
+ remove_backref_node(cache, node);
+ }
+
+ while (!list_empty(&cache->changed)) {
+ node = list_entry(cache->changed.next,
+ struct backref_node, list);
+ list_del_init(&node->list);
+ BUG_ON(node->pending);
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+
+ /*
+ * some nodes can be left in the pending list if there were
+ * errors during processing the pending nodes.
+ */
+ for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ list_for_each_entry(node, &cache->pending[level], list) {
+ BUG_ON(!node->pending);
+ if (node->bytenr == node->new_bytenr)
+ continue;
+ update_backref_node(cache, node, node->new_bytenr);
+ }
+ }
+
+ cache->last_trans = 0;
+ return 1;
+}
+
+static int should_ignore_root(struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+
+ if (!root->ref_cows)
+ return 0;
+
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+
+ if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
+ root->fs_info->running_transaction->transid - 1)
+ return 0;
+ /*
+ * if there is reloc tree and it was created in previous
+ * transaction backref lookup can find the reloc tree,
+ * so backref node for the fs tree root is useless for
+ * relocation.
+ */
+ return 1;
+}
+
/*
* find reloc tree by address of tree root
*/
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
* for all upper level blocks that directly/indirectly reference the
* block are also cached.
*/
-static struct backref_node *build_backref_tree(struct reloc_control *rc,
- struct backref_cache *cache,
- struct btrfs_key *node_key,
- int level, u64 bytenr)
+static noinline_for_stack
+struct backref_node *build_backref_tree(struct reloc_control *rc,
+ struct btrfs_key *node_key,
+ int level, u64 bytenr)
{
+ struct backref_cache *cache = &rc->backref_cache;
struct btrfs_path *path1;
struct btrfs_path *path2;
struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
unsigned long end;
unsigned long ptr;
LIST_HEAD(list);
+ LIST_HEAD(useless);
+ int cowonly;
int ret;
int err = 0;
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
goto out;
}
- node = kmalloc(sizeof(*node), GFP_NOFS);
+ node = alloc_backref_node(cache);
if (!node) {
err = -ENOMEM;
goto out;
}
- backref_node_init(node);
node->bytenr = bytenr;
- node->owner = 0;
node->level = level;
node->lowest = 1;
cur = node;
@@ -587,17 +780,20 @@ again:
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
key.type == BTRFS_EXTENT_REF_V0_KEY) {
- if (key.objectid == key.offset &&
- key.type == BTRFS_EXTENT_REF_V0_KEY) {
+ if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
struct btrfs_extent_ref_v0 *ref0;
ref0 = btrfs_item_ptr(eb, path1->slots[0],
struct btrfs_extent_ref_v0);
root = find_tree_root(rc, eb, ref0);
- if (root)
- cur->root = root;
- else
- cur->old_root = 1;
- break;
+ if (!root->ref_cows)
+ cur->cowonly = 1;
+ if (key.objectid == key.offset) {
+ if (root && !should_ignore_root(root))
+ cur->root = root;
+ else
+ list_add(&cur->list, &useless);
+ break;
+ }
}
#else
BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +810,20 @@ again:
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
}
rb_node = tree_search(&cache->rb_root, key.offset);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = key.offset;
- upper->owner = 0;
upper->level = cur->level + 1;
/*
* backrefs for the upper level block isn't
@@ -639,11 +833,12 @@ again:
} else {
upper = rb_entry(rb_node, struct backref_node,
rb_node);
+ BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- list_add(&edge->list[LOWER], &cur->upper);
- edge->node[UPPER] = upper;
+ list_add_tail(&edge->list[LOWER], &cur->upper);
edge->node[LOWER] = cur;
+ edge->node[UPPER] = upper;
goto next;
} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +852,17 @@ again:
goto out;
}
+ if (!root->ref_cows)
+ cur->cowonly = 1;
+
if (btrfs_root_level(&root->root_item) == cur->level) {
/* tree root */
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
cur->bytenr);
- cur->root = root;
+ if (should_ignore_root(root))
+ list_add(&cur->list, &useless);
+ else
+ cur->root = root;
break;
}
@@ -692,11 +893,14 @@ again:
if (!path2->nodes[level]) {
BUG_ON(btrfs_root_bytenr(&root->root_item) !=
lower->bytenr);
- lower->root = root;
+ if (should_ignore_root(root))
+ list_add(&lower->list, &useless);
+ else
+ lower->root = root;
break;
}
- edge = kzalloc(sizeof(*edge), GFP_NOFS);
+ edge = alloc_backref_edge(cache);
if (!edge) {
err = -ENOMEM;
goto out;
@@ -705,16 +909,17 @@ again:
eb = path2->nodes[level];
rb_node = tree_search(&cache->rb_root, eb->start);
if (!rb_node) {
- upper = kmalloc(sizeof(*upper), GFP_NOFS);
+ upper = alloc_backref_node(cache);
if (!upper) {
- kfree(edge);
+ free_backref_edge(cache, edge);
err = -ENOMEM;
goto out;
}
- backref_node_init(upper);
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
+ if (!root->ref_cows)
+ upper->cowonly = 1;
/*
* if we know the block isn't shared
@@ -744,10 +949,12 @@ again:
rb_node);
BUG_ON(!upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
+ if (!upper->owner)
+ upper->owner = btrfs_header_owner(eb);
}
list_add_tail(&edge->list[LOWER], &lower->upper);
- edge->node[UPPER] = upper;
edge->node[LOWER] = lower;
+ edge->node[UPPER] = upper;
if (rb_node)
break;
@@ -785,8 +992,13 @@ next:
* into the cache.
*/
BUG_ON(!node->checked);
- rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
- BUG_ON(rb_node);
+ cowonly = node->cowonly;
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, node->bytenr,
+ &node->rb_node);
+ BUG_ON(rb_node);
+ list_add_tail(&node->lower, &cache->leaves);
+ }
list_for_each_entry(edge, &node->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1007,14 @@ next:
edge = list_entry(list.next, struct backref_edge, list[UPPER]);
list_del_init(&edge->list[UPPER]);
upper = edge->node[UPPER];
+ if (upper->detached) {
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ continue;
+ }
if (!RB_EMPTY_NODE(&upper->rb_node)) {
if (upper->lowest) {
@@ -807,25 +1027,69 @@ next:
}
BUG_ON(!upper->checked);
- rb_node = tree_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
- BUG_ON(rb_node);
+ BUG_ON(cowonly != upper->cowonly);
+ if (!cowonly) {
+ rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+ &upper->rb_node);
+ BUG_ON(rb_node);
+ }
list_add_tail(&edge->list[UPPER], &upper->lower);
list_for_each_entry(edge, &upper->upper, list[LOWER])
list_add_tail(&edge->list[UPPER], &list);
}
+ /*
+ * process useless backref nodes. backref nodes for tree leaves
+ * are deleted from the cache. backref nodes for upper level
+ * tree blocks are left in the cache to avoid unnecessary backref
+ * lookup.
+ */
+ while (!list_empty(&useless)) {
+ upper = list_entry(useless.next, struct backref_node, list);
+ list_del_init(&upper->list);
+ BUG_ON(!list_empty(&upper->upper));
+ if (upper == node)
+ node = NULL;
+ if (upper->lowest) {
+ list_del_init(&upper->lower);
+ upper->lowest = 0;
+ }
+ while (!list_empty(&upper->lower)) {
+ edge = list_entry(upper->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&edge->list[UPPER]);
+ list_del(&edge->list[LOWER]);
+ lower = edge->node[LOWER];
+ free_backref_edge(cache, edge);
+
+ if (list_empty(&lower->upper))
+ list_add(&lower->list, &useless);
+ }
+ __mark_block_processed(rc, upper);
+ if (upper->level > 0) {
+ list_add(&upper->list, &cache->detached);
+ upper->detached = 1;
+ } else {
+ rb_erase(&upper->rb_node, &cache->rb_root);
+ free_backref_node(cache, upper);
+ }
+ }
out:
btrfs_free_path(path1);
btrfs_free_path(path2);
if (err) {
- INIT_LIST_HEAD(&list);
+ while (!list_empty(&useless)) {
+ lower = list_entry(useless.next,
+ struct backref_node, upper);
+ list_del_init(&lower->upper);
+ }
upper = node;
+ INIT_LIST_HEAD(&list);
while (upper) {
if (RB_EMPTY_NODE(&upper->rb_node)) {
list_splice_tail(&upper->upper, &list);
- kfree(upper);
+ free_backref_node(cache, upper);
}
if (list_empty(&list))
@@ -833,15 +1097,104 @@ out:
edge = list_entry(list.next, struct backref_edge,
list[LOWER]);
+ list_del(&edge->list[LOWER]);
upper = edge->node[UPPER];
- kfree(edge);
+ free_backref_edge(cache, edge);
}
return ERR_PTR(err);
}
+ BUG_ON(node && node->detached);
return node;
}
/*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *src,
+ struct btrfs_root *dest)
+{
+ struct btrfs_root *reloc_root = src->reloc_root;
+ struct backref_cache *cache = &rc->backref_cache;
+ struct backref_node *node = NULL;
+ struct backref_node *new_node;
+ struct backref_edge *edge;
+ struct backref_edge *new_edge;
+ struct rb_node *rb_node;
+
+ if (cache->last_trans > 0)
+ update_backref_cache(trans, cache);
+
+ rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node, rb_node);
+ if (node->detached)
+ node = NULL;
+ else
+ BUG_ON(node->new_bytenr != reloc_root->node->start);
+ }
+
+ if (!node) {
+ rb_node = tree_search(&cache->rb_root,
+ reloc_root->commit_root->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct backref_node,
+ rb_node);
+ BUG_ON(node->detached);
+ }
+ }
+
+ if (!node)
+ return 0;
+
+ new_node = alloc_backref_node(cache);
+ if (!new_node)
+ return -ENOMEM;
+
+ new_node->bytenr = dest->node->start;
+ new_node->level = node->level;
+ new_node->lowest = node->lowest;
+ new_node->root = dest;
+
+ if (!node->lowest) {
+ list_for_each_entry(edge, &node->lower, list[UPPER]) {
+ new_edge = alloc_backref_edge(cache);
+ if (!new_edge)
+ goto fail;
+
+ new_edge->node[UPPER] = new_node;
+ new_edge->node[LOWER] = edge->node[LOWER];
+ list_add_tail(&new_edge->list[UPPER],
+ &new_node->lower);
+ }
+ }
+
+ rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+ &new_node->rb_node);
+ BUG_ON(rb_node);
+
+ if (!new_node->lowest) {
+ list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+ list_add_tail(&new_edge->list[LOWER],
+ &new_edge->node[LOWER]->upper);
+ }
+ }
+ return 0;
+fail:
+ while (!list_empty(&new_node->lower)) {
+ new_edge = list_entry(new_node->lower.next,
+ struct backref_edge, list[UPPER]);
+ list_del(&new_edge->list[UPPER]);
+ free_backref_edge(cache, new_edge);
+ }
+ free_backref_node(cache, new_node);
+ return -ENOMEM;
+}
+
+/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
return 0;
}
-/*
- * create reloc tree for a given fs tree. reloc tree is just a
- * snapshot of the fs tree with special root objectid.
- */
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_root *reloc_root;
struct extent_buffer *eb;
@@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_key root_key;
int ret;
- if (root->reloc_root) {
- reloc_root = root->reloc_root;
- reloc_root->last_trans = trans->transid;
- return 0;
- }
-
- if (!root->fs_info->reloc_ctl ||
- !root->fs_info->reloc_ctl->create_reloc_root ||
- root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
- return 0;
-
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
BUG_ON(!root_item);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = root->root_key.objectid;
+ root_key.offset = objectid;
- ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
- BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (root->root_key.objectid == objectid) {
+ /* called by btrfs_init_reloc_root */
+ ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+
+ btrfs_set_root_last_snapshot(&root->root_item,
+ trans->transid - 1);
+ } else {
+ /*
+ * called by btrfs_reloc_post_snapshot_hook.
+ * the source tree is a reloc tree, all tree blocks
+ * modified after it was created have RELOC flag
+ * set in their headers. so it's OK to not update
+ * the 'last_snapshot'.
+ */
+ ret = btrfs_copy_root(trans, root, root->node, &eb,
+ BTRFS_TREE_RELOC_OBJECTID);
+ BUG_ON(ret);
+ }
- btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
memcpy(root_item, &root->root_item, sizeof(*root_item));
- btrfs_set_root_refs(root_item, 1);
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
btrfs_set_root_generation(root_item, trans->transid);
- memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
- root_item->drop_level = 0;
+
+ if (root->root_key.objectid == objectid) {
+ btrfs_set_root_refs(root_item, 0);
+ memset(&root_item->drop_progress, 0,
+ sizeof(struct btrfs_disk_key));
+ root_item->drop_level = 0;
+ }
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
&root_key);
BUG_ON(IS_ERR(reloc_root));
reloc_root->last_trans = trans->transid;
+ return reloc_root;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_root *reloc_root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ int clear_rsv = 0;
+
+ if (root->reloc_root) {
+ reloc_root = root->reloc_root;
+ reloc_root->last_trans = trans->transid;
+ return 0;
+ }
+
+ if (!rc || !rc->create_reloc_tree ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+ return 0;
+
+ if (!trans->block_rsv) {
+ trans->block_rsv = rc->block_rsv;
+ clear_rsv = 1;
+ }
+ reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+ if (clear_rsv)
+ trans->block_rsv = NULL;
__add_reloc_root(reloc_root);
root->reloc_root = reloc_root;
@@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
- if (btrfs_root_refs(root_item) == 0) {
+ if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
del = 1;
}
@@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
goto out;
}
- if (new_bytenr)
- *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
ret = 0;
out:
btrfs_free_path(path);
@@ -1114,19 +1503,18 @@ out:
* update file extent items in the tree leaf to point to
* the new locations.
*/
-static int replace_file_extents(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root,
- struct extent_buffer *leaf,
- struct list_head *inode_list)
+static noinline_for_stack
+int replace_file_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root,
+ struct extent_buffer *leaf)
{
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
struct inode *inode = NULL;
- struct inodevec *ivec = NULL;
u64 parent;
u64 bytenr;
- u64 new_bytenr;
+ u64 new_bytenr = 0;
u64 num_bytes;
u64 end;
u32 nritems;
@@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
* to complete and drop the extent cache
*/
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
- if (!ivec || ivec->nr == INODEVEC_SIZE) {
- ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
- BUG_ON(!ivec);
- ivec->nr = 0;
- list_add_tail(&ivec->list, inode_list);
- }
if (first) {
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
first = 0;
} else if (inode && inode->i_ino < key.objectid) {
+ btrfs_add_delayed_iput(inode);
inode = find_next_inode(root, key.objectid);
- if (inode)
- ivec->inode[ivec->nr++] = inode;
}
if (inode && inode->i_ino == key.objectid) {
end = key.offset +
@@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
ret = get_new_location(rc->data_inode, &new_bytenr,
bytenr, num_bytes);
- if (ret > 0)
+ if (ret > 0) {
+ WARN_ON(1);
continue;
+ }
BUG_ON(ret < 0);
btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
}
if (dirty)
btrfs_mark_buffer_dirty(leaf);
+ if (inode)
+ btrfs_add_delayed_iput(inode);
return 0;
}
@@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* if no block got replaced, 0 is returned. if there are other
* errors, a negative error number is returned.
*/
-static int replace_path(struct btrfs_trans_handle *trans,
- struct btrfs_root *dest, struct btrfs_root *src,
- struct btrfs_path *path, struct btrfs_key *next_key,
- struct extent_buffer **leaf,
- int lowest_level, int max_level)
+static noinline_for_stack
+int replace_path(struct btrfs_trans_handle *trans,
+ struct btrfs_root *dest, struct btrfs_root *src,
+ struct btrfs_path *path, struct btrfs_key *next_key,
+ int lowest_level, int max_level)
{
struct extent_buffer *eb;
struct extent_buffer *parent;
@@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
u64 new_ptr_gen;
u64 last_snapshot;
u32 blocksize;
+ int cow = 0;
int level;
int ret;
int slot;
BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(lowest_level > 1 && leaf);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
-
+again:
slot = path->slots[lowest_level];
btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
@@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
return 0;
}
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
- BUG_ON(ret);
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ BUG_ON(ret);
+ }
btrfs_set_lock_blocking(eb);
if (next_key) {
@@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
memcmp_node_keys(parent, slot, path, level)) {
- if (level <= lowest_level && !leaf) {
+ if (level <= lowest_level) {
ret = 0;
break;
}
@@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
eb = read_tree_block(dest, old_bytenr, blocksize,
old_ptr_gen);
btrfs_tree_lock(eb);
- ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
- BUG_ON(ret);
- btrfs_set_lock_blocking(eb);
-
- if (level <= lowest_level) {
- *leaf = eb;
- ret = 0;
- break;
+ if (cow) {
+ ret = btrfs_cow_block(trans, dest, eb, parent,
+ slot, &eb);
+ BUG_ON(ret);
}
+ btrfs_set_lock_blocking(eb);
btrfs_tree_unlock(parent);
free_extent_buffer(parent);
@@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
continue;
}
+ if (!cow) {
+ btrfs_tree_unlock(parent);
+ free_extent_buffer(parent);
+ cow = 1;
+ goto again;
+ }
+
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
btrfs_release_path(src, path);
@@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
return 0;
}
-static void put_inodes(struct list_head *list)
-{
- struct inodevec *ivec;
- while (!list_empty(list)) {
- ivec = list_entry(list->next, struct inodevec, list);
- list_del(&ivec->list);
- while (ivec->nr > 0) {
- ivec->nr--;
- iput(ivec->inode[ivec->nr]);
- }
- kfree(ivec);
- }
-}
-
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key)
@@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
- struct extent_buffer *leaf = NULL;
+ struct extent_buffer *leaf;
unsigned long nr;
int level;
int max_level;
int replaced = 0;
int ret;
int err = 0;
+ u32 min_reserved;
path = btrfs_alloc_path();
if (!path)
@@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- trans = btrfs_start_transaction(root, 1);
+ min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ memset(&next_key, 0, sizeof(next_key));
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, 0);
- btrfs_release_path(reloc_root, path);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ trans->block_rsv = rc->block_rsv;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
- err = ret;
- goto out;
+ ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
+ min_reserved, 0);
+ if (ret) {
+ BUG_ON(ret != -EAGAIN);
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ continue;
}
- leaf = path->nodes[0];
- btrfs_unlock_up_safe(path, 1);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- if (ret < 0)
- err = ret;
- goto out;
- }
-
- memset(&next_key, 0, sizeof(next_key));
-
- while (1) {
- leaf = NULL;
replaced = 0;
- trans = btrfs_start_transaction(root, 1);
max_level = level;
ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (!find_next_key(path, level, &key) &&
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
- } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, &leaf,
- level, max_level);
} else {
- ret = replace_path(trans, root, reloc_root,
- path, &next_key, NULL,
- level, max_level);
+ ret = replace_path(trans, root, reloc_root, path,
+ &next_key, level, max_level);
}
if (ret < 0) {
err = ret;
@@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
replaced = 1;
- } else if (leaf) {
- /*
- * no block got replaced, try replacing file extents
- */
- btrfs_item_key_to_cpu(leaf, &key, 0);
- ret = replace_file_extents(trans, rc, root, leaf,
- &inode_list);
- btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- BUG_ON(ret < 0);
}
ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
root_item->drop_level = level;
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- /*
- * put inodes outside transaction, otherwise we may deadlock.
- */
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
}
@@ -1765,87 +2109,125 @@ out:
sizeof(root_item->drop_progress));
root_item->drop_level = 0;
btrfs_set_root_refs(root_item, 0);
+ btrfs_update_reloc_root(trans, root);
}
nr = trans->blocks_used;
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
- put_inodes(&inode_list);
-
if (replaced && rc->stage == UPDATE_DATA_PTRS)
invalidate_extent_cache(root, &key, &next_key);
return err;
}
-/*
- * callback for the work threads.
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
+static noinline_for_stack
+int prepare_to_merge(struct reloc_control *rc, int err)
{
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
+ struct btrfs_root *root = rc->extent_root;
struct btrfs_root *reloc_root;
- struct async_merge *async;
+ struct btrfs_trans_handle *trans;
+ LIST_HEAD(reloc_roots);
+ u64 num_bytes = 0;
+ int ret;
+ int retries = 0;
+
+ mutex_lock(&root->fs_info->trans_mutex);
+ rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ rc->merging_rsv_size += rc->nodes_relocated * 2;
+ mutex_unlock(&root->fs_info->trans_mutex);
+again:
+ if (!err) {
+ num_bytes = rc->merging_rsv_size;
+ ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+ num_bytes, &retries);
+ if (ret)
+ err = ret;
+ }
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ if (!err) {
+ if (num_bytes != rc->merging_rsv_size) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_block_rsv_release(rc->extent_root,
+ rc->block_rsv, num_bytes);
+ retries = 0;
+ goto again;
+ }
+ }
- async = container_of(work, struct async_merge, work);
- reloc_root = async->root;
+ rc->merge_reloc_tree = 1;
+
+ while (!list_empty(&rc->reloc_roots)) {
+ reloc_root = list_entry(rc->reloc_roots.next,
+ struct btrfs_root, root_list);
+ list_del_init(&reloc_root->root_list);
- if (btrfs_root_refs(&reloc_root->root_item) > 0) {
root = read_fs_root(reloc_root->fs_info,
reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
- merge_reloc_root(async->rc, root);
-
- trans = btrfs_start_transaction(root, 1);
+ /*
+ * set reference count to 1, so btrfs_recover_relocation
+ * knows it should resumes merging
+ */
+ if (!err)
+ btrfs_set_root_refs(&reloc_root->root_item, 1);
btrfs_update_reloc_root(trans, root);
- btrfs_end_transaction(trans, root);
- }
- btrfs_drop_snapshot(reloc_root, 0);
+ list_add(&reloc_root->root_list, &reloc_roots);
+ }
- if (atomic_dec_and_test(async->num_pending))
- complete(async->done);
+ list_splice(&reloc_roots, &rc->reloc_roots);
- kfree(async);
+ if (!err)
+ btrfs_commit_transaction(trans, rc->extent_root);
+ else
+ btrfs_end_transaction(trans, rc->extent_root);
+ return err;
}
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
{
- struct async_merge *async;
struct btrfs_root *root;
- struct completion done;
- atomic_t num_pending;
+ struct btrfs_root *reloc_root;
+ LIST_HEAD(reloc_roots);
+ int found = 0;
+ int ret;
+again:
+ root = rc->extent_root;
+ mutex_lock(&root->fs_info->trans_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->trans_mutex);
- init_completion(&done);
- atomic_set(&num_pending, 1);
+ while (!list_empty(&reloc_roots)) {
+ found = 1;
+ reloc_root = list_entry(reloc_roots.next,
+ struct btrfs_root, root_list);
- while (!list_empty(&rc->reloc_roots)) {
- root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
- list_del_init(&root->root_list);
+ if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+ root = read_fs_root(reloc_root->fs_info,
+ reloc_root->root_key.offset);
+ BUG_ON(IS_ERR(root));
+ BUG_ON(root->reloc_root != reloc_root);
- async = kmalloc(sizeof(*async), GFP_NOFS);
- BUG_ON(!async);
- async->work.func = merge_func;
- async->work.flags = 0;
- async->rc = rc;
- async->root = root;
- async->done = &done;
- async->num_pending = &num_pending;
- atomic_inc(&num_pending);
- btrfs_queue_worker(&rc->workers, &async->work);
+ ret = merge_reloc_root(rc, root);
+ BUG_ON(ret);
+ } else {
+ list_del_init(&reloc_root->root_list);
+ }
+ btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
}
- if (!atomic_dec_and_test(&num_pending))
- wait_for_completion(&done);
-
+ if (found) {
+ found = 0;
+ goto again;
+ }
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
return 0;
}
@@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return btrfs_record_root_in_trans(trans, root);
}
-/*
- * select one tree from trees that references the block.
- * for blocks in refernce counted trees, we preper reloc tree.
- * if no reloc tree found and reloc_only is true, NULL is returned.
- */
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[],
- int *nr, int reloc_only)
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node,
+ struct backref_edge *edges[], int *nr)
{
struct backref_node *next;
struct btrfs_root *root;
- int index;
- int loop = 0;
-again:
- index = 0;
+ int index = 0;
+
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- if (!root) {
- BUG_ON(!node->old_root);
- goto skip;
- }
-
- /* no other choice for non-refernce counted tree */
- if (!root->ref_cows) {
- BUG_ON(reloc_only);
- break;
- }
+ BUG_ON(!root);
+ BUG_ON(!root->ref_cows);
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
break;
}
- if (loop) {
- btrfs_record_root_in_trans(trans, root);
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+
+ if (next->new_bytenr != root->node->start) {
+ BUG_ON(next->new_bytenr);
+ BUG_ON(!list_empty(&next->list));
+ next->new_bytenr = root->node->start;
+ next->root = root;
+ list_add_tail(&next->list,
+ &rc->backref_cache.changed);
+ __mark_block_processed(rc, next);
break;
}
- if (reloc_only || next != node) {
- if (!root->reloc_root)
- btrfs_record_root_in_trans(trans, root);
- root = root->reloc_root;
- /*
- * if the reloc tree was created in current
- * transation, there is no node in backref tree
- * corresponds to the root of the reloc tree.
- */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- trans->transid - 1)
- break;
- }
-skip:
+ WARN_ON(1);
root = NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
}
+ if (!root)
+ return NULL;
- if (!root && !loop && !reloc_only) {
- loop = 1;
- goto again;
+ *nr = index;
+ next = node;
+ /* setup backref node path for btrfs_reloc_cow_block */
+ while (1) {
+ rc->backref_cache.path[next->level] = next;
+ if (--index < 0)
+ break;
+ next = edges[index]->node[UPPER];
}
-
- if (root)
- *nr = index;
- else
- *nr = 0;
-
return root;
}
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return -ENOENT if the block is root of reloc tree.
+ */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
struct backref_node *node)
{
+ struct backref_node *next;
+ struct btrfs_root *root;
+ struct btrfs_root *fs_root = NULL;
struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
- int nr;
- return __select_one_root(trans, node, edges, &nr, 0);
+ int index = 0;
+
+ next = node;
+ while (1) {
+ cond_resched();
+ next = walk_up_backref(next, edges, &index);
+ root = next->root;
+ BUG_ON(!root);
+
+ /* no other choice for non-refernce counted tree */
+ if (!root->ref_cows)
+ return root;
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+ fs_root = root;
+
+ if (next != node)
+ return NULL;
+
+ next = walk_down_backref(edges, &index);
+ if (!next || next->level <= node->level)
+ break;
+ }
+
+ if (!fs_root)
+ return ERR_PTR(-ENOENT);
+ return fs_root;
}
static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
- struct backref_node *node,
- struct backref_edge *edges[], int *nr)
+u64 calcu_metadata_size(struct reloc_control *rc,
+ struct backref_node *node, int reserve)
{
- return __select_one_root(trans, node, edges, nr, 1);
+ struct backref_node *next = node;
+ struct backref_edge *edge;
+ struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+ u64 num_bytes = 0;
+ int index = 0;
+
+ BUG_ON(reserve && node->processed);
+
+ while (next) {
+ cond_resched();
+ while (1) {
+ if (next->processed && (reserve || next != node))
+ break;
+
+ num_bytes += btrfs_level_size(rc->extent_root,
+ next->level);
+
+ if (list_empty(&next->upper))
+ break;
+
+ edge = list_entry(next->upper.next,
+ struct backref_edge, list[LOWER]);
+ edges[index++] = edge;
+ next = edge->node[UPPER];
+ }
+ next = walk_down_backref(edges, &index);
+ }
+ return num_bytes;
}
-static void grab_path_buffers(struct btrfs_path *path,
- struct backref_node *node,
- struct backref_edge *edges[], int nr)
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct backref_node *node)
{
- int i = 0;
- while (1) {
- drop_node_buffer(node);
- node->eb = path->nodes[node->level];
- BUG_ON(!node->eb);
- if (path->locks[node->level])
- node->locked = 1;
- path->nodes[node->level] = NULL;
- path->locks[node->level] = 0;
-
- if (i >= nr)
- break;
+ struct btrfs_root *root = rc->extent_root;
+ u64 num_bytes;
+ int ret;
+
+ num_bytes = calcu_metadata_size(rc, node, 1) * 2;
- edges[i]->blockptr = node->eb->start;
- node = edges[i]->node[UPPER];
- i++;
+ trans->block_rsv = rc->block_rsv;
+ ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
+ &rc->block_rsv_retries);
+ if (ret) {
+ if (ret == -EAGAIN)
+ rc->commit_transaction = 1;
+ return ret;
}
+
+ rc->block_rsv_retries = 0;
+ return 0;
+}
+
+static void release_metadata_space(struct reloc_control *rc,
+ struct backref_node *node)
+{
+ u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
}
/*
@@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
* in that case this function just updates pointers.
*/
static int do_relocation(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_key *key,
struct btrfs_path *path, int lowest)
@@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
BUG_ON(lowest && node->eb);
path->lowest_level = node->level + 1;
+ rc->backref_cache.path[node->level] = node;
list_for_each_entry(edge, &node->upper, list[LOWER]) {
cond_resched();
- if (node->eb && node->eb->start == edge->blockptr)
- continue;
upper = edge->node[UPPER];
- root = select_reloc_root(trans, upper, edges, &nr);
- if (!root)
- continue;
-
- if (upper->eb && !upper->locked)
+ root = select_reloc_root(trans, rc, upper, edges, &nr);
+ BUG_ON(!root);
+
+ if (upper->eb && !upper->locked) {
+ if (!lowest) {
+ ret = btrfs_bin_search(upper->eb, key,
+ upper->level, &slot);
+ BUG_ON(ret);
+ bytenr = btrfs_node_blockptr(upper->eb, slot);
+ if (node->eb->start == bytenr)
+ goto next;
+ }
drop_node_buffer(upper);
+ }
if (!upper->eb) {
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
BUG_ON(ret > 0);
- slot = path->slots[upper->level];
+ if (!upper->eb) {
+ upper->eb = path->nodes[upper->level];
+ path->nodes[upper->level] = NULL;
+ } else {
+ BUG_ON(upper->eb != path->nodes[upper->level]);
+ }
- btrfs_unlock_up_safe(path, upper->level + 1);
- grab_path_buffers(path, upper, edges, nr);
+ upper->locked = 1;
+ path->locks[upper->level] = 0;
+ slot = path->slots[upper->level];
btrfs_release_path(NULL, path);
} else {
ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(upper->eb, slot);
- if (!lowest) {
- if (node->eb->start == bytenr) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- continue;
- }
+ if (lowest) {
+ BUG_ON(bytenr != node->bytenr);
} else {
- BUG_ON(node->bytenr != bytenr);
+ if (node->eb->start == bytenr)
+ goto next;
}
blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
slot, &eb);
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
if (ret < 0) {
err = ret;
- break;
+ goto next;
}
- btrfs_set_lock_blocking(eb);
- node->eb = eb;
- node->locked = 1;
+ BUG_ON(node->eb != eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
BUG_ON(ret);
}
- if (!lowest) {
- btrfs_tree_unlock(upper->eb);
- upper->locked = 0;
- }
+next:
+ if (!upper->pending)
+ drop_node_buffer(upper);
+ else
+ unlock_node_buffer(upper);
+ if (err)
+ break;
}
+
+ if (!err && node->pending) {
+ drop_node_buffer(node);
+ list_move_tail(&node->list, &rc->backref_cache.changed);
+ node->pending = 0;
+ }
+
path->lowest_level = 0;
+ BUG_ON(err == -ENOSPC);
return err;
}
static int link_to_upper(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
struct backref_node *node,
struct btrfs_path *path)
{
struct btrfs_key key;
- if (!node->eb || list_empty(&node->upper))
- return 0;
btrfs_node_key_to_cpu(node->eb, &key, 0);
- return do_relocation(trans, node, &key, path, 0);
+ return do_relocation(trans, rc, node, &key, path, 0);
}
static int finish_pending_nodes(struct btrfs_trans_handle *trans,
- struct backref_cache *cache,
- struct btrfs_path *path)
+ struct reloc_control *rc,
+ struct btrfs_path *path, int err)
{
+ LIST_HEAD(list);
+ struct backref_cache *cache = &rc->backref_cache;
struct backref_node *node;
int level;
int ret;
- int err = 0;
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
node = list_entry(cache->pending[level].next,
- struct backref_node, lower);
- BUG_ON(node->level != level);
+ struct backref_node, list);
+ list_move_tail(&node->list, &list);
+ BUG_ON(!node->pending);
- ret = link_to_upper(trans, node, path);
- if (ret < 0)
- err = ret;
- /*
- * this remove the node from the pending list and
- * may add some other nodes to the level + 1
- * pending list
- */
- remove_backref_node(cache, node);
+ if (!err) {
+ ret = link_to_upper(trans, rc, node, path);
+ if (ret < 0)
+ err = ret;
+ }
}
+ list_splice_init(&list, &cache->pending[level]);
}
- BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
return err;
}
static void mark_block_processed(struct reloc_control *rc,
- struct backref_node *node)
+ u64 bytenr, u32 blocksize)
+{
+ set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+ EXTENT_DIRTY, GFP_NOFS);
+}
+
+static void __mark_block_processed(struct reloc_control *rc,
+ struct backref_node *node)
{
u32 blocksize;
if (node->level == 0 ||
in_block_group(node->bytenr, rc->block_group)) {
blocksize = btrfs_level_size(rc->extent_root, node->level);
- set_extent_bits(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY,
- GFP_NOFS);
+ mark_block_processed(rc, node->bytenr, blocksize);
}
node->processed = 1;
}
@@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
if (next->processed)
break;
- mark_block_processed(rc, next);
+ __mark_block_processed(rc, next);
if (list_empty(&next->upper))
break;
@@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
return 0;
}
-/*
- * check if there are any file extent pointers in the leaf point to
- * data require processing
- */
-static int check_file_extents(struct reloc_control *rc,
- u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
- struct btrfs_key found_key;
- struct btrfs_file_extent_item *fi;
- struct extent_buffer *leaf;
- u32 nritems;
- int i;
- int ret = 0;
-
- leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-
- nritems = btrfs_header_nritems(leaf);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &found_key, i);
- if (found_key.type != BTRFS_EXTENT_DATA_KEY)
- continue;
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
- bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
- if (bytenr == 0)
- continue;
- if (in_block_group(bytenr, rc->block_group)) {
- ret = 1;
- break;
- }
- }
- free_extent_buffer(leaf);
- return ret;
-}
-
-/*
- * scan child blocks of a given block to find blocks require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_node *node,
- struct rb_root *blocks)
-{
- struct tree_block *block;
- struct rb_node *rb_node;
- u64 bytenr;
- u64 ptr_gen;
- u32 blocksize;
- u32 nritems;
- int i;
- int err = 0;
-
- nritems = btrfs_header_nritems(node->eb);
- blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
-
- readahead_tree_block(rc->extent_root,
- bytenr, blocksize, ptr_gen);
- }
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- bytenr = btrfs_node_blockptr(node->eb, i);
- ptr_gen = btrfs_node_ptr_generation(node->eb, i);
- if (ptr_gen == trans->transid)
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
- continue;
- if (tree_block_processed(bytenr, blocksize, rc))
- continue;
- if (!in_block_group(bytenr, rc->block_group) &&
- !check_file_extents(rc, bytenr, blocksize, ptr_gen))
- continue;
-
- block = kmalloc(sizeof(*block), GFP_NOFS);
- if (!block) {
- err = -ENOMEM;
- break;
- }
- block->bytenr = bytenr;
- btrfs_node_key_to_cpu(node->eb, &block->key, i);
- block->level = node->level - 1;
- block->key_ready = 1;
- rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
- BUG_ON(rb_node);
- }
- if (err)
- free_block_list(blocks);
- return err;
-}
-
-/*
- * find adjacent blocks require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct backref_cache *cache,
- struct rb_root *blocks, int level,
- struct backref_node **upper)
-{
- struct backref_node *node;
- int ret = 0;
-
- WARN_ON(!list_empty(&cache->pending[level]));
-
- if (list_empty(&cache->pending[level + 1]))
- return 1;
-
- node = list_entry(cache->pending[level + 1].next,
- struct backref_node, lower);
- if (node->eb)
- ret = add_child_blocks(trans, rc, node, blocks);
-
- *upper = node;
- return ret;
-}
-
static int get_tree_block_key(struct reloc_control *rc,
struct tree_block *block)
{
@@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_root *root;
- int ret;
+ int release = 0;
+ int ret = 0;
+ if (!node)
+ return 0;
+
+ BUG_ON(node->processed);
root = select_one_root(trans, node);
- if (unlikely(!root)) {
- rc->found_old_snapshot = 1;
+ if (root == ERR_PTR(-ENOENT)) {
update_processed_blocks(rc, node);
- return 0;
+ goto out;
}
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- ret = do_relocation(trans, node, key, path, 1);
- if (ret < 0)
- goto out;
- if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
- ret = replace_file_extents(trans, rc, root,
- node->eb, NULL);
- if (ret < 0)
- goto out;
- }
- drop_node_buffer(node);
- } else if (!root->ref_cows) {
- path->lowest_level = node->level;
- ret = btrfs_search_slot(trans, root, key, path, 0, 1);
- btrfs_release_path(root, path);
- if (ret < 0)
+ if (!root || root->ref_cows) {
+ ret = reserve_metadata_space(trans, rc, node);
+ if (ret)
goto out;
- } else if (root != node->root) {
- WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+ release = 1;
}
- update_processed_blocks(rc, node);
- ret = 0;
+ if (root) {
+ if (root->ref_cows) {
+ BUG_ON(node->new_bytenr);
+ BUG_ON(!list_empty(&node->list));
+ btrfs_record_root_in_trans(trans, root);
+ root = root->reloc_root;
+ node->new_bytenr = root->node->start;
+ node->root = root;
+ list_add_tail(&node->list, &rc->backref_cache.changed);
+ } else {
+ path->lowest_level = node->level;
+ ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+ btrfs_release_path(root, path);
+ if (ret > 0)
+ ret = 0;
+ }
+ if (!ret)
+ update_processed_blocks(rc, node);
+ } else {
+ ret = do_relocation(trans, rc, node, key, path, 1);
+ }
out:
- drop_node_buffer(node);
+ if (ret || node->level == 0 || node->cowonly) {
+ if (release)
+ release_metadata_space(rc, node);
+ remove_backref_node(&rc->backref_cache, node);
+ }
return ret;
}
@@ -2415,12 +2752,10 @@ static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct rb_root *blocks)
{
- struct backref_cache *cache;
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
struct rb_node *rb_node;
- int level = -1;
int ret;
int err = 0;
@@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- cache = kmalloc(sizeof(*cache), GFP_NOFS);
- if (!cache) {
- btrfs_free_path(path);
- return -ENOMEM;
- }
-
- backref_cache_init(cache);
-
rb_node = rb_first(blocks);
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- if (level == -1)
- level = block->level;
- else
- BUG_ON(level != block->level);
if (!block->key_ready)
reada_tree_block(rc, block);
rb_node = rb_next(rb_node);
@@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
- node = build_backref_tree(rc, cache, &block->key,
+ node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
err = PTR_ERR(node);
@@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- err = ret;
+ if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ err = ret;
goto out;
}
- remove_backref_node(cache, node);
rb_node = rb_next(rb_node);
}
-
- if (level > 0)
- goto out;
-
+out:
free_block_list(blocks);
+ err = finish_pending_nodes(trans, rc, path, err);
- /*
- * now backrefs of some upper level tree blocks have been cached,
- * try relocating blocks referenced by these upper level blocks.
- */
- while (1) {
- struct backref_node *upper = NULL;
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- break;
+ btrfs_free_path(path);
+ return err;
+}
- ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
- &upper);
- if (ret < 0)
- err = ret;
- if (ret != 0)
- break;
+static noinline_for_stack
+int prealloc_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
+{
+ u64 alloc_hint = 0;
+ u64 start;
+ u64 end;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ u64 num_bytes;
+ int nr = 0;
+ int ret = 0;
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing)
- goto out;
- BUG_ON(!block->key_ready);
- node = build_backref_tree(rc, cache, &block->key,
- level, block->bytenr);
- if (IS_ERR(node)) {
- err = PTR_ERR(node);
- goto out;
- }
+ BUG_ON(cluster->start != cluster->boundary[0]);
+ mutex_lock(&inode->i_mutex);
- ret = relocate_tree_block(trans, rc, node,
- &block->key, path);
- if (ret < 0) {
- err = ret;
- goto out;
- }
- remove_backref_node(cache, node);
- rb_node = rb_next(rb_node);
- }
- free_block_list(blocks);
+ ret = btrfs_check_data_free_space(inode, cluster->end +
+ 1 - cluster->start);
+ if (ret)
+ goto out;
- if (upper) {
- ret = link_to_upper(trans, upper, path);
- if (ret < 0) {
- err = ret;
- break;
- }
- remove_backref_node(cache, upper);
- }
+ while (nr < cluster->nr) {
+ start = cluster->boundary[nr] - offset;
+ if (nr + 1 < cluster->nr)
+ end = cluster->boundary[nr + 1] - 1 - offset;
+ else
+ end = cluster->end - offset;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ num_bytes = end + 1 - start;
+ ret = btrfs_prealloc_file_range(inode, 0, start,
+ num_bytes, num_bytes,
+ end + 1, &alloc_hint);
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ if (ret)
+ break;
+ nr++;
}
+ btrfs_free_reserved_data_space(inode, cluster->end +
+ 1 - cluster->start);
out:
- free_block_list(blocks);
-
- ret = finish_pending_nodes(trans, cache, path);
- if (ret < 0)
- err = ret;
-
- kfree(cache);
- btrfs_free_path(path);
- return err;
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
static noinline_for_stack
@@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
- unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
int nr = 0;
@@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!ra)
return -ENOMEM;
- index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
- last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+ ret = prealloc_file_extent_cluster(inode, cluster);
+ if (ret)
+ goto out;
- mutex_lock(&inode->i_mutex);
+ file_ra_state_init(ra, inode->i_mapping);
- i_size_write(inode, cluster->end + 1 - offset);
ret = setup_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
- goto out_unlock;
-
- file_ra_state_init(ra, inode->i_mapping);
+ goto out;
- WARN_ON(cluster->start != cluster->boundary[0]);
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
while (index <= last_index) {
+ ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+ if (ret)
+ goto out;
+
page = find_lock_page(inode->i_mapping, index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
last_index + 1 - index);
page = grab_cache_page(inode->i_mapping, index);
if (!page) {
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
- goto out_unlock;
+ goto out;
}
}
@@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
if (!PageUptodate(page)) {
unlock_page(page);
page_cache_release(page);
+ btrfs_delalloc_release_metadata(inode,
+ PAGE_CACHE_SIZE);
ret = -EIO;
- goto out_unlock;
+ goto out;
}
}
@@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
EXTENT_BOUNDARY, GFP_NOFS);
nr++;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+ btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
set_page_dirty(page);
- dirty_page++;
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end, GFP_NOFS);
@@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
page_cache_release(page);
index++;
- if (nr < cluster->nr &&
- page_end + 1 + offset == cluster->boundary[nr]) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
- dirty_page = 0;
- }
- }
- if (dirty_page) {
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_page);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
+ btrfs_throttle(BTRFS_I(inode)->root);
}
WARN_ON(nr != cluster->nr);
-out_unlock:
- mutex_unlock(&inode->i_mutex);
+out:
kfree(ra);
return ret;
}
@@ -2870,9 +3172,6 @@ out:
static int block_use_full_backref(struct reloc_control *rc,
struct extent_buffer *eb)
{
- struct btrfs_path *path;
- struct btrfs_extent_item *ei;
- struct btrfs_key key;
u64 flags;
int ret;
@@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
return 1;
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- key.objectid = eb->start;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = eb->len;
-
- path->search_commit_root = 1;
- path->skip_locking = 1;
- ret = btrfs_search_slot(NULL, rc->extent_root,
- &key, path, 0, 0);
+ ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
+ eb->start, eb->len, NULL, &flags);
BUG_ON(ret);
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_extent_item);
- flags = btrfs_extent_flags(path->nodes[0], ei);
- BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
ret = 1;
else
ret = 0;
- btrfs_free_path(path);
return ret;
}
@@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_extent_inline_ref *iref;
unsigned long ptr;
unsigned long end;
- u32 blocksize;
+ u32 blocksize = btrfs_level_size(rc->extent_root, 0);
int ret;
int err = 0;
- ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
- extent_key->offset);
- BUG_ON(ret < 0);
- if (ret > 0) {
- /* the relocated data is fragmented */
- rc->extents_skipped++;
- btrfs_release_path(rc->extent_root, path);
- return 0;
- }
-
- blocksize = btrfs_level_size(rc->extent_root, 0);
-
eb = path->nodes[0];
ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
*/
static noinline_for_stack
int find_next_extent(struct btrfs_trans_handle *trans,
- struct reloc_control *rc, struct btrfs_path *path)
+ struct reloc_control *rc, struct btrfs_path *path,
+ struct btrfs_key *extent_key)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -3225,6 +3499,7 @@ next:
rc->search_start = end + 1;
} else {
rc->search_start = key.objectid + key.offset;
+ memcpy(extent_key, &key, sizeof(key));
return 0;
}
}
@@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags)
return 0;
}
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+ if (!rc->block_rsv)
+ return -ENOMEM;
+
+ /*
+ * reserve some space for creating reloc trees.
+ * btrfs_init_reloc_root will use them when there
+ * is no reservation in transaction handle.
+ */
+ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+ rc->extent_root->nodesize * 256,
+ &rc->block_rsv_retries);
+ if (ret)
+ return ret;
+
+ rc->block_rsv->refill_used = 1;
+ btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+
+ memset(&rc->cluster, 0, sizeof(rc->cluster));
+ rc->search_start = rc->block_group->key.objectid;
+ rc->extents_found = 0;
+ rc->nodes_relocated = 0;
+ rc->merging_rsv_size = 0;
+ rc->block_rsv_retries = 0;
+
+ rc->create_reloc_tree = 1;
+ set_reloc_control(rc);
+
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+ btrfs_commit_transaction(trans, rc->extent_root);
+ return 0;
+}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
- struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
- cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
- if (!cluster)
- return -ENOMEM;
-
path = btrfs_alloc_path();
- if (!path) {
- kfree(cluster);
+ if (!path)
return -ENOMEM;
- }
-
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
- rc->search_start = rc->block_group->key.objectid;
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
- GFP_NOFS);
-
- rc->create_reloc_root = 1;
- set_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
+ ret = prepare_to_relocate(rc);
+ if (ret) {
+ err = ret;
+ goto out_free;
+ }
while (1) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_start_transaction(rc->extent_root, 0);
+
+ if (update_backref_cache(trans, &rc->backref_cache)) {
+ btrfs_end_transaction(trans, rc->extent_root);
+ continue;
+ }
- ret = find_next_extent(trans, rc, path);
+ ret = find_next_extent(trans, rc, path, &key);
if (ret < 0)
err = ret;
if (ret != 0)
@@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- item_size = btrfs_item_size_nr(path->nodes[0],
- path->slots[0]);
+ item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
if (item_size >= sizeof(*ei)) {
flags = btrfs_extent_flags(path->nodes[0], ei);
ret = check_extent_flags(flags);
@@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
} else if (rc->stage == UPDATE_DATA_PTRS &&
- (flags & BTRFS_EXTENT_FLAG_DATA)) {
+ (flags & BTRFS_EXTENT_FLAG_DATA)) {
ret = add_data_references(rc, &key, path, &blocks);
} else {
btrfs_release_path(rc->extent_root, path);
ret = 0;
}
if (ret < 0) {
- err = 0;
+ err = ret;
break;
}
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
+ if (ret != -EAGAIN) {
+ err = ret;
+ break;
+ }
+ rc->extents_found--;
+ rc->search_start = key.objectid;
+ }
+ }
+
+ ret = btrfs_block_rsv_check(trans, rc->extent_root,
+ rc->block_rsv, 0, 5);
+ if (ret < 0) {
+ if (ret != -EAGAIN) {
err = ret;
+ WARN_ON(1);
break;
}
+ rc->commit_transaction = 1;
}
- nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ if (rc->commit_transaction) {
+ rc->commit_transaction = 0;
+ ret = btrfs_commit_transaction(trans, rc->extent_root);
+ BUG_ON(ret);
+ } else {
+ nr = trans->blocks_used;
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root, nr);
+ }
trans = NULL;
- btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
ret = relocate_data_extent(rc->data_inode,
- &key, cluster);
+ &key, &rc->cluster);
if (ret < 0) {
err = ret;
break;
}
}
}
- btrfs_free_path(path);
+
+ btrfs_release_path(rc->extent_root, path);
+ clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+ GFP_NOFS);
if (trans) {
nr = trans->blocks_used;
- btrfs_end_transaction(trans, rc->extent_root);
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
if (!err) {
- ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ ret = relocate_file_extent_cluster(rc->data_inode,
+ &rc->cluster);
if (ret < 0)
err = ret;
}
- kfree(cluster);
+ rc->create_reloc_tree = 0;
+ set_reloc_control(rc);
- rc->create_reloc_root = 0;
- smp_mb();
+ backref_cache_cleanup(&rc->backref_cache);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
- if (rc->extents_found > 0) {
- trans = btrfs_start_transaction(rc->extent_root, 1);
- btrfs_commit_transaction(trans, rc->extent_root);
- }
+ err = prepare_to_merge(rc, err);
merge_reloc_roots(rc);
+ rc->merge_reloc_tree = 0;
unset_reloc_control(rc);
+ btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
/* get rid of pinned extents */
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
-
+out_free:
+ btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+ btrfs_free_path(path);
return err;
}
@@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+ btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+ BTRFS_INODE_PREALLOC);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(root, path);
out:
@@ -3460,8 +3790,9 @@ out:
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
*/
-static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+static noinline_for_stack
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group_cache *group)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
@@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (IS_ERR(root))
return ERR_CAST(root);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ trans = btrfs_start_transaction(root, 6);
+ if (IS_ERR(trans))
+ return ERR_CAST(trans);
err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
if (err)
@@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
out:
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
-
btrfs_btree_balance_dirty(root, nr);
if (err) {
if (inode)
@@ -3506,6 +3837,21 @@ out:
return inode;
}
+static struct reloc_control *alloc_reloc_control(void)
+{
+ struct reloc_control *rc;
+
+ rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ if (!rc)
+ return NULL;
+
+ INIT_LIST_HEAD(&rc->reloc_roots);
+ backref_cache_init(&rc->backref_cache);
+ mapping_tree_init(&rc->reloc_root_tree);
+ extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+ return rc;
+}
+
/*
* function to relocate all extents in a block group.
*/
@@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
struct btrfs_fs_info *fs_info = extent_root->fs_info;
struct reloc_control *rc;
int ret;
+ int rw = 0;
int err = 0;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc)
return -ENOMEM;
- mapping_tree_init(&rc->reloc_root_tree);
- extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
- INIT_LIST_HEAD(&rc->reloc_roots);
+ rc->extent_root = extent_root;
rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
BUG_ON(!rc->block_group);
- btrfs_init_workers(&rc->workers, "relocate",
- fs_info->thread_pool_size, NULL);
-
- rc->extent_root = extent_root;
- btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+ if (!rc->block_group->ro) {
+ ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
+ rw = 1;
+ }
rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
while (1) {
- rc->extents_found = 0;
- rc->extents_skipped = 0;
-
mutex_lock(&fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
- break;
+ goto out;
}
if (rc->extents_found == 0)
@@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
- } else if (rc->stage == UPDATE_DATA_PTRS &&
- rc->extents_skipped >= rc->extents_found) {
- iput(rc->data_inode);
- rc->data_inode = create_reloc_inode(fs_info,
- rc->block_group);
- if (IS_ERR(rc->data_inode)) {
- err = PTR_ERR(rc->data_inode);
- rc->data_inode = NULL;
- break;
- }
- rc->stage = MOVE_DATA_EXTENTS;
- rc->found_file_extent = 0;
}
}
@@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
out:
+ if (err && rw)
+ btrfs_set_block_group_rw(extent_root, rc->block_group);
iput(rc->data_inode);
- btrfs_stop_workers(&rc->workers);
btrfs_put_block_group(rc->block_group);
kfree(rc);
return err;
@@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
@@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
if (list_empty(&reloc_roots))
goto out;
- rc = kzalloc(sizeof(*rc), GFP_NOFS);
+ rc = alloc_reloc_control();
if (!rc) {
err = -ENOMEM;
goto out;
}
- mapping_tree_init(&rc->reloc_root_tree);
- INIT_LIST_HEAD(&rc->reloc_roots);
- btrfs_init_workers(&rc->workers, "relocate",
- root->fs_info->thread_pool_size, NULL);
rc->extent_root = root->fs_info->extent_root;
set_reloc_control(rc);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
+
+ rc->merge_reloc_tree = 1;
+
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root->reloc_root = reloc_root;
}
- trans = btrfs_start_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
merge_reloc_roots(rc);
unset_reloc_control(rc);
- trans = btrfs_start_transaction(rc->extent_root, 1);
+ trans = btrfs_join_transaction(rc->extent_root, 1);
btrfs_commit_transaction(trans, rc->extent_root);
out:
- if (rc) {
- btrfs_stop_workers(&rc->workers);
- kfree(rc);
- }
+ kfree(rc);
while (!list_empty(&reloc_roots)) {
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
@@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
btrfs_put_ordered_extent(ordered);
return 0;
}
+
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct extent_buffer *buf,
+ struct extent_buffer *cow)
+{
+ struct reloc_control *rc;
+ struct backref_node *node;
+ int first_cow = 0;
+ int level;
+ int ret;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc)
+ return;
+
+ BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+ root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+
+ level = btrfs_header_level(buf);
+ if (btrfs_header_generation(buf) <=
+ btrfs_root_last_snapshot(&root->root_item))
+ first_cow = 1;
+
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+ rc->create_reloc_tree) {
+ WARN_ON(!first_cow && level == 0);
+
+ node = rc->backref_cache.path[level];
+ BUG_ON(node->bytenr != buf->start &&
+ node->new_bytenr != buf->start);
+
+ drop_node_buffer(node);
+ extent_buffer_get(cow);
+ node->eb = cow;
+ node->new_bytenr = cow->start;
+
+ if (!node->pending) {
+ list_move_tail(&node->list,
+ &rc->backref_cache.pending[level]);
+ node->pending = 1;
+ }
+
+ if (first_cow)
+ __mark_block_processed(rc, node);
+
+ if (first_cow && level > 0)
+ rc->nodes_relocated += buf->len;
+ }
+
+ if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+ ret = replace_file_extents(trans, rc, root, cow);
+ BUG_ON(ret);
+ }
+}
+
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * requried for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending,
+ u64 *bytes_to_reserve)
+{
+ struct btrfs_root *root;
+ struct reloc_control *rc;
+
+ root = pending->root;
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ if (!rc->merge_reloc_tree)
+ return;
+
+ root = root->reloc_root;
+ BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+ /*
+ * relocation is in the stage of merging trees. the space
+ * used by merging a reloc tree is twice the size of
+ * relocated tree nodes in the worst case. half for cowing
+ * the reloc tree, half for cowing the fs tree. the space
+ * used by cowing the reloc tree will be freed after the
+ * tree is dropped. if we create snapshot, cowing the fs
+ * tree may use more space than it frees. so we need
+ * reserve extra space.
+ */
+ *bytes_to_reserve += rc->nodes_relocated;
+}
+
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+ struct btrfs_pending_snapshot *pending)
+{
+ struct btrfs_root *root = pending->root;
+ struct btrfs_root *reloc_root;
+ struct btrfs_root *new_root;
+ struct reloc_control *rc;
+ int ret;
+
+ if (!root->reloc_root)
+ return;
+
+ rc = root->fs_info->reloc_ctl;
+ rc->merging_rsv_size += rc->nodes_relocated;
+
+ if (rc->merge_reloc_tree) {
+ ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+ rc->block_rsv,
+ rc->nodes_relocated);
+ BUG_ON(ret);
+ }
+
+ new_root = pending->snap;
+ reloc_root = create_reloc_root(trans, root->reloc_root,
+ new_root->root_key.objectid);
+
+ __add_reloc_root(reloc_root);
+ new_root->reloc_root = reloc_root;
+
+ if (rc->create_reloc_tree) {
+ ret = clone_backref_node(trans, rc, root, reloc_root);
+ BUG_ON(ret);
+ }
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
+ struct btrfs_key root_key;
+ struct btrfs_root *root;
int err = 0;
int ret;
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
+ root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root_key.offset = (u64)-1;
+
while (1) {
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
- ret = btrfs_find_dead_roots(tree_root, key.offset);
- if (ret) {
+ root_key.objectid = key.offset;
+ key.offset++;
+
+ root = btrfs_read_fs_root_no_name(tree_root->fs_info,
+ &root_key);
+ if (!IS_ERR(root))
+ continue;
+
+ ret = PTR_ERR(root);
+ if (ret != -ENOENT) {
err = ret;
break;
}
- key.offset++;
+ ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2909a03e5230..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
btrfs_start_delalloc_inodes(root, 0);
btrfs_wait_ordered_extents(root, 0, 0);
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
ret = btrfs_commit_transaction(trans, root);
return ret;
}
@@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
return -EINVAL;
- /* recover relocation */
- ret = btrfs_recover_relocation(root);
+ ret = btrfs_cleanup_fs_roots(root->fs_info);
WARN_ON(ret);
- ret = btrfs_cleanup_fs_roots(root->fs_info);
+ /* recover relocation */
+ ret = btrfs_recover_relocation(root);
WARN_ON(ret);
sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct list_head *head = &root->fs_info->space_info;
struct btrfs_space_info *found;
u64 total_used = 0;
- u64 data_used = 0;
int bits = dentry->d_sb->s_blocksize_bits;
__be32 *fsid = (__be32 *)root->fs_info->fsid;
rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
- BTRFS_BLOCK_GROUP_RAID10|
- BTRFS_BLOCK_GROUP_RAID1)) {
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
-
- total_used += found->bytes_used;
- if (found->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used += found->bytes_used;
- else
- data_used += found->total_bytes;
- }
+ list_for_each_entry_rcu(found, head, list)
+ total_used += found->disk_used;
rcu_read_unlock();
buf->f_namelen = BTRFS_NAME_LEN;
buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
buf->f_bfree = buf->f_blocks - (total_used >> bits);
- buf->f_bavail = buf->f_blocks - (data_used >> bits);
+ buf->f_bavail = buf->f_bfree;
buf->f_bsize = dentry->d_sb->s_blocksize;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
TRANS_USERSPACE,
};
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+ if (!root->fs_info->log_root_recovering &&
+ ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+ type == TRANS_USERSPACE))
+ return 1;
+ return 0;
+}
+
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
- int num_blocks, int type)
+ u64 num_items, int type)
{
- struct btrfs_trans_handle *h =
- kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ struct btrfs_trans_handle *h;
+ struct btrfs_transaction *cur_trans;
+ int retries = 0;
int ret;
+again:
+ h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ if (!h)
+ return ERR_PTR(-ENOMEM);
mutex_lock(&root->fs_info->trans_mutex);
- if (!root->fs_info->log_root_recovering &&
- ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
- type == TRANS_USERSPACE))
+ if (may_wait_transaction(root, type))
wait_current_trans(root);
+
ret = join_transaction(root);
BUG_ON(ret);
- h->transid = root->fs_info->running_transaction->transid;
- h->transaction = root->fs_info->running_transaction;
- h->blocks_reserved = num_blocks;
+ cur_trans = root->fs_info->running_transaction;
+ cur_trans->use_count++;
+ mutex_unlock(&root->fs_info->trans_mutex);
+
+ h->transid = cur_trans->transid;
+ h->transaction = cur_trans;
h->blocks_used = 0;
h->block_group = 0;
- h->alloc_exclude_nr = 0;
- h->alloc_exclude_start = 0;
+ h->bytes_reserved = 0;
h->delayed_ref_updates = 0;
+ h->block_rsv = NULL;
- if (!current->journal_info && type != TRANS_USERSPACE)
- current->journal_info = h;
+ smp_mb();
+ if (cur_trans->blocked && may_wait_transaction(root, type)) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+
+ if (num_items > 0) {
+ ret = btrfs_trans_reserve_metadata(h, root, num_items,
+ &retries);
+ if (ret == -EAGAIN) {
+ btrfs_commit_transaction(h, root);
+ goto again;
+ }
+ if (ret < 0) {
+ btrfs_end_transaction(h, root);
+ return ERR_PTR(ret);
+ }
+ }
- root->fs_info->running_transaction->use_count++;
+ mutex_lock(&root->fs_info->trans_mutex);
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
+
+ if (!current->journal_info && type != TRANS_USERSPACE)
+ current->journal_info = h;
return h;
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks)
+ int num_items)
{
- return start_transaction(root, num_blocks, TRANS_START);
+ return start_transaction(root, num_items, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
int num_blocks)
{
- return start_transaction(root, num_blocks, TRANS_JOIN);
+ return start_transaction(root, 0, TRANS_JOIN);
}
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
int num_blocks)
{
- return start_transaction(r, num_blocks, TRANS_USERSPACE);
+ return start_transaction(r, 0, TRANS_USERSPACE);
}
/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
mutex_unlock(&root->fs_info->trans_mutex);
}
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ int ret;
+ ret = btrfs_block_rsv_check(trans, root,
+ &root->fs_info->global_block_rsv, 0, 5);
+ return ret ? 1 : 0;
+}
+
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int updates;
+
+ if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+ return 1;
+
+ updates = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (updates)
+ btrfs_run_delayed_refs(trans, root, updates);
+
+ return should_end_transaction(trans, root);
+}
+
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int throttle)
{
- struct btrfs_transaction *cur_trans;
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *info = root->fs_info;
int count = 0;
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
count++;
}
+ btrfs_trans_release_metadata(trans, root);
+
+ if (!root->fs_info->open_ioctl_trans &&
+ should_end_transaction(trans, root))
+ trans->transaction->blocked = 1;
+
+ if (cur_trans->blocked && !cur_trans->in_commit) {
+ if (throttle)
+ return btrfs_commit_transaction(trans, root);
+ else
+ wake_up_process(info->transaction_kthread);
+ }
+
mutex_lock(&info->trans_mutex);
- cur_trans = info->running_transaction;
- WARN_ON(cur_trans != trans->transaction);
+ WARN_ON(cur_trans != info->running_transaction);
WARN_ON(cur_trans->num_writers < 1);
cur_trans->num_writers--;
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_free_log(trans, root);
btrfs_update_reloc_root(trans, root);
+ btrfs_orphan_commit_root(trans, root);
if (root->commit_root != root->node) {
switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
struct btrfs_fs_info *info = root->fs_info;
- int ret;
struct btrfs_trans_handle *trans;
+ int ret;
unsigned long nr;
- smp_mb();
- if (root->defrag_running)
+ if (xchg(&root->defrag_running, 1))
return 0;
- trans = btrfs_start_transaction(root, 1);
+
while (1) {
- root->defrag_running = 1;
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
ret = btrfs_defrag_leaves(trans, root, cacheonly);
+
nr = trans->blocks_used;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(info->tree_root, nr);
cond_resched();
- trans = btrfs_start_transaction(root, 1);
if (root->fs_info->closing || ret != -EAGAIN)
break;
}
root->defrag_running = 0;
- smp_mb();
- btrfs_end_transaction(trans, root);
- return 0;
+ return ret;
}
#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct inode *parent_inode;
+ struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
int ret;
- u64 objectid;
- int namelen;
+ int retries = 0;
+ u64 to_reserve = 0;
u64 index = 0;
-
- parent_inode = pending->dentry->d_parent->d_inode;
- parent_root = BTRFS_I(parent_inode)->root;
+ u64 objectid;
new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
if (!new_root_item) {
- ret = -ENOMEM;
+ pending->error = -ENOMEM;
goto fail;
}
+
ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
- if (ret)
+ if (ret) {
+ pending->error = ret;
goto fail;
+ }
+
+ btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+ btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
+
+ if (to_reserve > 0) {
+ ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+ to_reserve, &retries);
+ if (ret) {
+ pending->error = ret;
+ goto fail;
+ }
+ }
key.objectid = objectid;
- /* record when the snapshot was created in key.offset */
- key.offset = trans->transid;
- btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+ key.offset = (u64)-1;
+ key.type = BTRFS_ROOT_ITEM_KEY;
- memcpy(&pending->root_key, &key, sizeof(key));
- pending->root_key.offset = (u64)-1;
+ trans->block_rsv = &pending->block_rsv;
+ dentry = pending->dentry;
+ parent_inode = dentry->d_parent->d_inode;
+ parent_root = BTRFS_I(parent_inode)->root;
record_root_in_trans(trans, parent_root);
+
/*
* insert the directory item
*/
- namelen = strlen(pending->name);
ret = btrfs_set_inode_index(parent_inode, &index);
BUG_ON(ret);
ret = btrfs_insert_dir_item(trans, parent_root,
- pending->name, namelen,
- parent_inode->i_ino,
- &pending->root_key, BTRFS_FT_DIR, index);
+ dentry->d_name.name, dentry->d_name.len,
+ parent_inode->i_ino, &key,
+ BTRFS_FT_DIR, index);
BUG_ON(ret);
- btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+ btrfs_i_size_write(parent_inode, parent_inode->i_size +
+ dentry->d_name.len * 2);
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
btrfs_set_root_node(new_root_item, tmp);
- ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
- new_root_item);
- BUG_ON(ret);
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
+ ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
+ BUG_ON(ret);
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- pending->root_key.objectid,
+ /*
+ * insert root back/forward references
+ */
+ ret = btrfs_add_root_ref(trans, tree_root, objectid,
parent_root->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
+ parent_inode->i_ino, index,
+ dentry->d_name.name, dentry->d_name.len);
BUG_ON(ret);
+ key.offset = (u64)-1;
+ pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(pending->snap));
+
+ btrfs_reloc_post_snapshot(trans, pending);
+ btrfs_orphan_post_snapshot(trans, pending);
fail:
kfree(new_root_item);
- return ret;
+ btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
+ return 0;
}
/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
return ret;
}
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->blocked;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, root, 0);
BUG_ON(ret);
+ btrfs_trans_release_metadata(trans, root);
+
cur_trans = trans->transaction;
/*
* set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
snap_pending = 1;
WARN_ON(cur_trans != trans->transaction);
- prepare_to_wait(&cur_trans->writer_wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
btrfs_run_ordered_operations(root, 1);
+ prepare_to_wait(&cur_trans->writer_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
smp_mb();
if (cur_trans->num_writers > 1 || should_grow)
schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
- btrfs_drop_snapshot(root, 0);
+ btrfs_drop_snapshot(root, NULL, 0);
else
- btrfs_drop_snapshot(root, 1);
+ btrfs_drop_snapshot(root, NULL, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
struct btrfs_trans_handle {
u64 transid;
+ u64 block_group;
+ u64 bytes_reserved;
unsigned long blocks_reserved;
unsigned long blocks_used;
- struct btrfs_transaction *transaction;
- u64 block_group;
- u64 alloc_exclude_start;
- u64 alloc_exclude_nr;
unsigned long delayed_ref_updates;
+ struct btrfs_transaction *transaction;
+ struct btrfs_block_rsv *block_rsv;
};
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct btrfs_root *root;
- char *name;
- struct btrfs_key root_key;
+ struct btrfs_root *snap;
+ /* block reservation for the operation */
+ struct btrfs_block_rsv block_rsv;
+ /* extra metadata reseration for relocation */
+ int error;
struct list_head list;
};
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
- int num_blocks);
+ int num_blocks);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
- int num_blocks);
+ int num_blocks);
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->nodes[1], 0,
cache_only, &last_ret,
&root->defrag_progress);
- WARN_ON(ret && ret != -EAGAIN);
+ if (ret) {
+ WARN_ON(ret == -EAGAIN);
+ goto out;
+ }
if (next_key_ret == 0) {
memcpy(&root->defrag_progress, &key, sizeof(key));
ret = -EAGAIN;
}
-
- btrfs_release_path(root, path);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
int ret;
+ int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->tree_log_mutex);
if (!root->fs_info->log_root_tree) {
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
- if (!root->log_root) {
+ if (err == 0 && !root->log_root) {
ret = btrfs_add_log_tree(trans, root);
- BUG_ON(ret);
+ if (ret)
+ err = ret;
}
mutex_unlock(&root->fs_info->tree_log_mutex);
root->log_batch++;
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return 0;
+ return err;
}
/*
@@ -376,7 +379,7 @@ insert:
BUG_ON(ret);
}
} else if (ret) {
- BUG();
+ return ret;
}
dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
- wc->process_func(root, next, wc, ptr_gen);
-
if (*level == 1) {
+ wc->process_func(root, next, wc, ptr_gen);
+
path->slots[*level]++;
if (wc->free) {
btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
-
- bytenr = path->nodes[*level]->start;
-
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- wc->process_func(root, path->nodes[*level], wc,
- btrfs_header_generation(path->nodes[*level]));
-
- if (wc->free) {
- next = path->nodes[*level];
- btrfs_tree_lock(next);
- clean_tree_block(trans, root, next);
- btrfs_set_lock_blocking(next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
-
- WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
- ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
- BUG_ON(ret);
- }
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
- *level += 1;
+ path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
cond_resched();
return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
slot = path->slots[i];
- if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+ if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
struct extent_buffer *node;
node = path->nodes[i];
path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
- BUG_ON(ret);
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
wake_up(&log_root_tree->log_writer_wait);
}
+ if (ret) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out;
+ }
+
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
return 0;
}
-/*
- * free all the extents used by the tree log. This should be called
- * at commit time of the full transaction
- */
-int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+static void free_log_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log)
{
int ret;
- struct btrfs_root *log;
- struct key;
u64 start;
u64 end;
struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
.process_func = process_one_buffer
};
- if (!root->log_root || root->fs_info->log_root_recovering)
- return 0;
-
- log = root->log_root;
ret = walk_log_tree(trans, log, &wc);
BUG_ON(ret);
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
}
- if (log->log_transid > 0) {
- ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
- &log->root_key);
- BUG_ON(ret);
- }
- root->log_root = NULL;
free_extent_buffer(log->node);
kfree(log);
+}
+
+/*
+ * free all the extents used by the tree log. This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+ if (root->log_root) {
+ free_log_tree(trans, root->log_root);
+ root->log_root = NULL;
+ }
+ return 0;
+}
+
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->log_root_tree) {
+ free_log_tree(trans, fs_info->log_root_tree);
+ fs_info->log_root_tree = NULL;
+ }
return 0;
}
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
+ int err = 0;
int bytes_del = 0;
if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
index, name, name_len, -1);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ err = PTR_ERR(di);
+ goto fail;
+ }
+ if (di) {
ret = btrfs_delete_one_dir_name(trans, log, path, di);
bytes_del += name_len;
BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+ if (ret < 0) {
+ err = ret;
+ goto fail;
+ }
if (ret == 0) {
struct btrfs_inode_item *item;
u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
ret = 0;
btrfs_release_path(log, path);
}
-
+fail:
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
+ if (ret == -ENOSPC) {
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 0;
+ }
btrfs_end_log_trans(root);
return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
else
key.type = BTRFS_DIR_LOG_INDEX_KEY;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
- BUG_ON(ret);
+ if (ret)
+ return ret;
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
+ int err = 0;
int ret;
int i;
int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
}
btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
goto done;
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto done;
+ }
}
path->slots[0] = nritems;
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, log, dst_path,
path->nodes[0], path->slots[0],
&tmp);
-
- BUG_ON(ret);
- last_offset = tmp.offset;
+ if (ret)
+ err = ret;
+ else
+ last_offset = tmp.offset;
goto done;
}
}
done:
- *last_offset_ret = last_offset;
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- /* insert the log range keys to indicate where the log is valid */
- ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
- first_offset, last_offset);
- BUG_ON(ret);
- return 0;
+ if (err == 0) {
+ *last_offset_ret = last_offset;
+ /*
+ * insert the log range keys to indicate where the log
+ * is valid
+ */
+ ret = insert_dir_log_key(trans, log, path, key_type,
+ inode->i_ino, first_offset,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
+ return err;
}
/*
@@ -2501,7 +2529,8 @@ again:
ret = log_dir_items(trans, root, inode, path,
dst_path, key_type, min_key,
&max_key);
- BUG_ON(ret);
+ if (ret)
+ return ret;
if (max_key == (u64)-1)
break;
min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-
- if (ret != 1)
+ BUG_ON(ret == 0);
+ if (ret < 0)
break;
if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
btrfs_release_path(log, path);
}
btrfs_release_path(log, path);
- return 0;
+ return ret;
}
static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
}
ret = btrfs_insert_empty_items(trans, log, dst_path,
ins_keys, ins_sizes, nr);
- BUG_ON(ret);
+ if (ret) {
+ kfree(ins_data);
+ return ret;
+ }
for (i = 0; i < nr; i++, dst_path->slots[0]++) {
dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
+ ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
- ret = btrfs_csum_file_blocks(trans, log, sums);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_csum_file_blocks(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
- return 0;
+ return ret;
}
/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
u32 size;
+ int err = 0;
int ret;
int nritems;
int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
}
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
path->keep_locks = 1;
while (1) {
@@ -2768,7 +2805,10 @@ again:
ret = copy_items(trans, log, dst_path, src, ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
ret = copy_items(trans, log, dst_path, src,
ins_start_slot,
ins_nr, inode_only);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
ins_nr = 0;
}
WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
ret = log_directory_changes(trans, root, inode, path, dst_path);
- BUG_ON(ret);
+ if (ret) {
+ err = ret;
+ goto out_unlock;
+ }
}
BTRFS_I(inode)->logged_trans = trans->transid;
+out_unlock:
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return 0;
+ return err;
}
/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- start_log_trans(trans, root);
+ ret = start_log_trans(trans, root);
+ if (ret)
+ goto end_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
/*
* for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
*/
if (S_ISREG(inode->i_mode) &&
BTRFS_I(inode)->generation <= last_committed &&
- BTRFS_I(inode)->last_unlink_trans <= last_committed)
- goto no_parent;
+ BTRFS_I(inode)->last_unlink_trans <= last_committed) {
+ ret = 0;
+ goto end_trans;
+ }
inode_only = LOG_INODE_EXISTS;
while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
- BUG_ON(ret);
+ if (ret)
+ goto end_trans;
}
if (IS_ROOT(parent))
break;
parent = parent->d_parent;
}
-no_parent:
ret = 0;
+end_trans:
+ if (ret < 0) {
+ BUG_ON(ret != -ENOSPC);
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ ret = 1;
+ }
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
path = btrfs_alloc_path();
BUG_ON(!path);
- trans = btrfs_start_transaction(fs_info->tree_root, 1);
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
wc.trans = trans;
wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
goto error;
}
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
- BUG_ON(ret);
+ if (ret)
+ return ret;
- trans = btrfs_start_transaction(root, 1);
+ trans = btrfs_start_transaction(root, 0);
BUG_ON(!trans);
lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
break;
BUG_ON(ret);
- trans = btrfs_start_transaction(dev_root, 1);
+ trans = btrfs_start_transaction(dev_root, 0);
BUG_ON(!trans);
ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
}
/* Shrinking succeeded, else we would be at "done". */
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
+ trans = btrfs_start_transaction(root, 0);
lock_chunks(root);
device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
if (trans)
return do_setxattr(trans, inode, name, value, size, flags);
- ret = btrfs_reserve_metadata_space(root, 2);
- if (ret)
- return ret;
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto out;
- }
btrfs_set_trans_block_group(trans, inode);
ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
BUG_ON(ret);
out:
btrfs_end_transaction_throttle(trans, root);
- btrfs_unreserve_metadata_space(root, 2);
return ret;
}
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
return ret;
}
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
+ssize_t compat_rw_copy_check_uvector(int type,
+ const struct compat_iovec __user *uvector, unsigned long nr_segs,
+ unsigned long fast_segs, struct iovec *fast_pointer,
+ struct iovec **ret_pointer)
+{
+ compat_ssize_t tot_len;
+ struct iovec *iov = *ret_pointer = fast_pointer;
+ ssize_t ret = 0;
+ int seg;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+ goto out;
+ if (nr_segs > fast_segs) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (iov == NULL) {
+ *ret_pointer = fast_pointer;
+ goto out;
+ }
+ }
+ *ret_pointer = iov;
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 and fitting an
+ * ssize_t. The total length is fitting an ssize_t
+ *
+ * Be careful here because iov_len is a size_t not an ssize_t
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0; seg < nr_segs; seg++) {
+ compat_ssize_t tmp = tot_len;
+ compat_uptr_t buf;
+ compat_ssize_t len;
+
+ if (__get_user(len, &uvector->iov_len) ||
+ __get_user(buf, &uvector->iov_base)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (len < 0) /* size_t not fitting in compat_ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
+ goto out;
+ if (!access_ok(vrfy_dir(type), buf, len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ iov->iov_base = compat_ptr(buf);
+ iov->iov_len = (compat_size_t) len;
+ uvector++;
+ iov++;
+ }
+ ret = tot_len;
+
+out:
+ return ret;
+}
+
static inline long
copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
ret = copy_iocb(nr, iocb, iocb64);
if (!ret)
- ret = sys_io_submit(ctx_id, nr, iocb64);
+ ret = do_io_submit(ctx_id, nr, iocb64, 1);
return ret;
}
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
{
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov=iovstack, *vector;
+ struct iovec *iov;
ssize_t ret;
- int seg;
io_fn_t fn;
iov_fn_t fnv;
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- ret = 0;
- if (nr_segs == 0)
- goto out;
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
ret = -EINVAL;
- if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
- goto out;
if (!file->f_op)
goto out;
- if (nr_segs > UIO_FASTIOV) {
- ret = -ENOMEM;
- iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (!iov)
- goto out;
- }
+
ret = -EFAULT;
if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
goto out;
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t. The total length is fitting an ssize_t
- *
- * Be careful here because iov_len is a size_t not an ssize_t
- */
- tot_len = 0;
- vector = iov;
- ret = -EINVAL;
- for (seg = 0 ; seg < nr_segs; seg++) {
- compat_ssize_t tmp = tot_len;
- compat_ssize_t len;
- compat_uptr_t buf;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting an compat_ssize_t .. */
- goto out;
- tot_len += len;
- if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
- goto out;
- vector->iov_base = compat_ptr(buf);
- vector->iov_len = (compat_size_t) len;
- uvector++;
- vector++;
- }
+ tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
+ UIO_FASTIOV, iovstack, &iov);
if (tot_len == 0) {
ret = 0;
goto out;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..da111aacb46e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
int reap_counter; /* rate limit reaping */
get_block_t *get_block; /* block mapping function */
dio_iodone_t *end_io; /* IO completion function */
+ dio_submit_t *submit_io; /* IO submition function */
+ loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
unsigned cur_page_offset; /* Offset into it, in bytes */
unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
sector_t cur_page_block; /* Where it starts */
+ loff_t cur_page_fs_offset; /* Offset in file */
/* BIO completion state */
spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio thats being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses their own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done it's completion work.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+ struct dio *dio = bio->bi_private;
+
+ if (dio->is_async)
+ dio_bio_end_aio(bio, error);
+ else
+ dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
+
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
bio->bi_end_io = dio_bio_end_io;
dio->bio = bio;
+ dio->logical_offset_in_bio = dio->cur_page_fs_offset;
return 0;
}
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
- submit_bio(dio->rw, bio);
+ if (dio->submit_io)
+ dio->submit_io(dio->rw, bio, dio->inode,
+ dio->logical_offset_in_bio);
+ else
+ submit_bio(dio->rw, bio);
dio->bio = NULL;
dio->boundary = 0;
+ dio->logical_offset_in_bio = 0;
}
/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
int ret = 0;
if (dio->bio) {
+ loff_t cur_offset = dio->block_in_file << dio->blkbits;
+ loff_t bio_next_offset = dio->logical_offset_in_bio +
+ dio->bio->bi_size;
+
/*
- * See whether this new request is contiguous with the old
+ * See whether this new request is contiguous with the old.
+ *
+ * Btrfs cannot handl having logically non-contiguous requests
+ * submitted. For exmple if you have
+ *
+ * Logical: [0-4095][HOLE][8192-12287]
+ * Phyiscal: [0-4095] [4096-8181]
+ *
+ * We cannot submit those pages together as one BIO. So if our
+ * current logical offset in the file does not equal what would
+ * be the next logical offset in the bio, submit the bio we
+ * have.
*/
- if (dio->final_block_in_bio != dio->cur_page_block)
+ if (dio->final_block_in_bio != dio->cur_page_block ||
+ cur_offset != bio_next_offset)
dio_bio_submit(dio);
/*
* Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
dio->cur_page_offset = offset;
dio->cur_page_len = len;
dio->cur_page_block = blocknr;
+ dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
out:
return ret;
}
@@ -935,7 +981,7 @@ static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
- struct dio *dio)
+ dio_submit_t submit_io, struct dio *dio)
{
unsigned long user_addr;
unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->get_block = get_block;
dio->end_io = end_io;
+ dio->submit_io = submit_io;
dio->final_block_in_bio = -1;
dio->next_block_for_io = -1;
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
}
} /* end iovec loop */
- if (ret == -ENOTBLK && (rw & WRITE)) {
+ if (ret == -ENOTBLK) {
/*
* The remaining part of the request will be
* be handled by buffered I/O when we return
@@ -1110,7 +1157,7 @@ ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, const struct iovec *iov, loff_t offset,
unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
- int flags)
+ dio_submit_t submit_io, int flags)
{
int seg;
size_t size;
@@ -1197,7 +1244,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
(end > i_size_read(inode)));
retval = direct_io_worker(rw, iocb, inode, iov, offset,
- nr_segs, blkbits, get_block, end_io, dio);
+ nr_segs, blkbits, get_block, end_io,
+ submit_io, dio);
/*
* In case of error extending write may have instantiated a few
diff --git a/fs/exec.c b/fs/exec.c
index 9badbc0bfb1d..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
- int count;
if (thread_group_empty(tsk))
goto no_thread_group;
@@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
spin_unlock_irq(lock);
return -EAGAIN;
}
+
sig->group_exit_task = tsk;
- zap_other_threads(tsk);
+ sig->notify_count = zap_other_threads(tsk);
+ if (!thread_group_leader(tsk))
+ sig->notify_count--;
- /* Account for the thread group leader hanging around: */
- count = thread_group_leader(tsk) ? 1 : 2;
- sig->notify_count = count;
- while (atomic_read(&sig->count) > count) {
+ while (sig->notify_count) {
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock_irq(lock);
schedule();
@@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
struct completion *vfork_done;
- int core_waiters;
+ int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+
+ down_write(&mm->mmap_sem);
+ if (!mm->core_state)
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
if (unlikely(core_waiters < 0))
@@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
}
+/*
+ * uhm_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace. Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process. Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1. This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info)
+{
+ struct file *rp, *wp;
+ struct fdtable *fdt;
+ struct coredump_params *cp = (struct coredump_params *)info->data;
+ struct files_struct *cf = current->files;
+
+ wp = create_write_pipe(0);
+ if (IS_ERR(wp))
+ return PTR_ERR(wp);
+
+ rp = create_read_pipe(wp, 0);
+ if (IS_ERR(rp)) {
+ free_write_pipe(wp);
+ return PTR_ERR(rp);
+ }
+
+ cp->file = wp;
+
+ sys_close(0);
+ fd_install(0, rp);
+ spin_lock(&cf->file_lock);
+ fdt = files_fdtable(cf);
+ FD_SET(0, fdt->open_fds);
+ FD_CLR(0, fdt->close_on_exec);
+ spin_unlock(&cf->file_lock);
+
+ /* and disallow core files too */
+ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+ return 0;
+}
+
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
{
struct core_state core_state;
char corename[CORENAME_MAX_SIZE + 1];
struct mm_struct *mm = current->mm;
struct linux_binfmt * binfmt;
- struct inode * inode;
const struct cred *old_cred;
struct cred *cred;
int retval = 0;
int flag = 0;
- int ispipe = 0;
- char **helper_argv = NULL;
- int helper_argc = 0;
- int dump_count = 0;
+ int ispipe;
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.signr = signr,
@@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
binfmt = mm->binfmt;
if (!binfmt || !binfmt->core_dump)
goto fail;
-
- cred = prepare_creds();
- if (!cred) {
- retval = -ENOMEM;
+ if (!__get_dumpable(cprm.mm_flags))
goto fail;
- }
- down_write(&mm->mmap_sem);
- /*
- * If another thread got here first, or we are not dumpable, bail out.
- */
- if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
- up_write(&mm->mmap_sem);
- put_cred(cred);
+ cred = prepare_creds();
+ if (!cred)
goto fail;
- }
-
/*
* We cannot trust fsuid as being the "true" uid of the
* process nor do we know its entire history. We only know it
@@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
}
retval = coredump_wait(exit_code, &core_state);
- if (retval < 0) {
- put_cred(cred);
- goto fail;
- }
+ if (retval < 0)
+ goto fail_creds;
old_cred = override_creds(cred);
@@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
ispipe = format_corename(corename, signr);
unlock_kernel();
- if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
- goto fail_unlock;
-
if (ispipe) {
- if (cprm.limit == 0) {
+ int dump_count;
+ char **helper_argv;
+
+ if (cprm.limit == 1) {
/*
* Normally core limits are irrelevant to pipes, since
* we're not writing to the file system, but we use
- * cprm.limit of 0 here as a speacial value. Any
- * non-zero limit gets set to RLIM_INFINITY below, but
+ * cprm.limit of 1 here as a speacial value. Any
+ * non-1 limit gets set to RLIM_INFINITY below, but
* a limit of 0 skips the dump. This is a consistent
* way to catch recursive crashes. We can still crash
- * if the core_pattern binary sets RLIM_CORE = !0
+ * if the core_pattern binary sets RLIM_CORE = !1
* but it runs as root, and can do lots of stupid things
* Note that we use task_tgid_vnr here to grab the pid
* of the process group leader. That way we get the
@@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
* core_pattern process dies.
*/
printk(KERN_WARNING
- "Process %d(%s) has RLIMIT_CORE set to 0\n",
+ "Process %d(%s) has RLIMIT_CORE set to 1\n",
task_tgid_vnr(current), current->comm);
printk(KERN_WARNING "Aborting core\n");
goto fail_unlock;
}
+ cprm.limit = RLIM_INFINITY;
dump_count = atomic_inc_return(&core_dump_count);
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
goto fail_dropcount;
}
- helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+ helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
if (!helper_argv) {
printk(KERN_WARNING "%s failed to allocate memory\n",
__func__);
goto fail_dropcount;
}
- cprm.limit = RLIM_INFINITY;
-
- /* SIGPIPE can happen, but it's just never processed */
- if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
- &cprm.file)) {
+ retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+ NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+ NULL, &cprm);
+ argv_free(helper_argv);
+ if (retval) {
printk(KERN_INFO "Core dump to %s pipe failed\n",
corename);
- goto fail_dropcount;
+ goto close_fail;
}
- } else
+ } else {
+ struct inode *inode;
+
+ if (cprm.limit < binfmt->min_coredump)
+ goto fail_unlock;
+
cprm.file = filp_open(corename,
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
0600);
- if (IS_ERR(cprm.file))
- goto fail_dropcount;
- inode = cprm.file->f_path.dentry->d_inode;
- if (inode->i_nlink > 1)
- goto close_fail; /* multiple links - don't dump */
- if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
- goto close_fail;
-
- /* AK: actually i see no reason to not allow this for named pipes etc.,
- but keep the previous behaviour for now. */
- if (!ispipe && !S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Dont allow local users get cute and trick others to coredump
- * into their pre-created files:
- * Note, this is not relevant for pipes
- */
- if (!ispipe && (inode->i_uid != current_fsuid()))
- goto close_fail;
- if (!cprm.file->f_op)
- goto close_fail;
- if (!cprm.file->f_op->write)
- goto close_fail;
- if (!ispipe &&
- do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
- goto close_fail;
+ if (IS_ERR(cprm.file))
+ goto fail_unlock;
- retval = binfmt->core_dump(&cprm);
+ inode = cprm.file->f_path.dentry->d_inode;
+ if (inode->i_nlink > 1)
+ goto close_fail;
+ if (d_unhashed(cprm.file->f_path.dentry))
+ goto close_fail;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ goto close_fail;
+ /*
+ * Dont allow local users get cute and trick others to coredump
+ * into their pre-created files.
+ */
+ if (inode->i_uid != current_fsuid())
+ goto close_fail;
+ if (!cprm.file->f_op || !cprm.file->f_op->write)
+ goto close_fail;
+ if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+ goto close_fail;
+ }
+ retval = binfmt->core_dump(&cprm);
if (retval)
current->signal->group_exit_code |= 0x80;
-close_fail:
+
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
- filp_close(cprm.file, NULL);
+close_fail:
+ if (cprm.file)
+ filp_close(cprm.file, NULL);
fail_dropcount:
- if (dump_count)
+ if (ispipe)
atomic_dec(&core_dump_count);
fail_unlock:
- if (helper_argv)
- argv_free(helper_argv);
-
+ coredump_finish(mm);
revert_creds(old_cred);
+fail_creds:
put_cred(cred);
- coredump_finish(mm);
fail:
return;
}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
*count = ar.len;
-
/*
- * Account for the allocated meta blocks
+ * Account for the allocated meta blocks. We will never
+ * fail EDQUOT for metdata, but we do account for it.
*/
if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ dquot_alloc_block_nofail(inode, ar.len);
}
return ret;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
else if (start_blk >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
- if (start_blk + count > (entry->start_blk +
+ if (start_blk + count > (entry->start_blk +
entry->count))
- entry->count = (start_blk + count -
+ entry->count = (start_blk + count -
entry->start_blk);
new_node = *n;
new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
error_msg = "inode out of bounds";
if (error_msg != NULL)
- __ext4_error(dir->i_sb, function,
- "bad entry in directory #%lu: %s - block=%llu"
+ ext4_error_inode(function, dir,
+ "bad entry in directory: %s - block=%llu"
"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
- dir->i_ino, error_msg,
- (unsigned long long) bh->b_blocknr,
+ error_msg, (unsigned long long) bh->b_blocknr,
(unsigned) (offset%bh->b_size), offset,
le32_to_cpu(de->inode),
rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1))) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
* We don't set the inode dirty flag since it's not
* critical that it get flushed back to the disk.
*/
- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
while (!error && !stored && filp->f_pos < inode->i_size) {
- ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
struct buffer_head *bh = NULL;
- map_bh.b_state = 0;
- err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+ map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+ map.m_len = 1;
+ err = ext4_map_blocks(NULL, inode, &map, 0);
if (err > 0) {
- pgoff_t index = map_bh.b_blocknr >>
+ pgoff_t index = map.m_pblk >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!ra_has_index(&filp->f_ra, index))
page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
&filp->f_ra, filp,
index, 1);
filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
- bh = ext4_bread(NULL, inode, blk, 0, &err);
+ bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
}
/*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
*/
if (!bh) {
if (!dir_has_error) {
- ext4_error(sb, "directory #%lu "
+ EXT4_ERROR_INODE(inode, "directory "
"contains a hole at offset %Lu",
- inode->i_ino,
(unsigned long long) filp->f_pos);
dir_has_error = 1;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..60bd31026e7c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
/*
* The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
#endif
#define EXT4_ERROR_INODE(inode, fmt, a...) \
- ext4_error_inode(__func__, (inode), (fmt), ## a);
+ ext4_error_inode(__func__, (inode), (fmt), ## a)
#define EXT4_ERROR_FILE(file, fmt, a...) \
- ext4_error_file(__func__, (file), (fmt), ## a);
+ ext4_error_file(__func__, (file), (fmt), ## a)
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
typedef unsigned int ext4_group_t;
/*
- * Flags used in mballoc's allocation_context flags field.
+ * Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
* flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
};
/*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks(). It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW (1 << BH_New)
+#define EXT4_MAP_MAPPED (1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
+#define EXT4_MAP_UNINIT (1 << BH_Uninit)
+#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_UNINIT)
+
+struct ext4_map_blocks {
+ ext4_fsblk_t m_pblk;
+ ext4_lblk_t m_lblk;
+ unsigned int m_len;
+ unsigned int m_flags;
+};
+
+/*
* For delayed allocation tracking
*/
struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
return flags & EXT4_OTHER_FLMASK;
}
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+ EXT4_INODE_SECRM = 0, /* Secure deletion */
+ EXT4_INODE_UNRM = 1, /* Undelete */
+ EXT4_INODE_COMPR = 2, /* Compress file */
+ EXT4_INODE_SYNC = 3, /* Synchronous updates */
+ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
+ EXT4_INODE_APPEND = 5, /* writes to file may only append */
+ EXT4_INODE_NODUMP = 6, /* do not dump file */
+ EXT4_INODE_NOATIME = 7, /* do not update atime */
+/* Reserved for compression usage... */
+ EXT4_INODE_DIRTY = 8,
+ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
+ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
+ EXT4_INODE_ECOMPR = 11, /* Compression error */
+/* End compression flags --- maybe not all used */
+ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
+ EXT4_INODE_IMAGIC = 13, /* AFS directory */
+ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
+ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
+ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
+ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
+ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
+ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
+ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
+ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+ CHECK_FLAG_VALUE(SECRM);
+ CHECK_FLAG_VALUE(UNRM);
+ CHECK_FLAG_VALUE(COMPR);
+ CHECK_FLAG_VALUE(SYNC);
+ CHECK_FLAG_VALUE(IMMUTABLE);
+ CHECK_FLAG_VALUE(APPEND);
+ CHECK_FLAG_VALUE(NODUMP);
+ CHECK_FLAG_VALUE(NOATIME);
+ CHECK_FLAG_VALUE(DIRTY);
+ CHECK_FLAG_VALUE(COMPRBLK);
+ CHECK_FLAG_VALUE(NOCOMPR);
+ CHECK_FLAG_VALUE(ECOMPR);
+ CHECK_FLAG_VALUE(INDEX);
+ CHECK_FLAG_VALUE(IMAGIC);
+ CHECK_FLAG_VALUE(JOURNAL_DATA);
+ CHECK_FLAG_VALUE(NOTAIL);
+ CHECK_FLAG_VALUE(DIRSYNC);
+ CHECK_FLAG_VALUE(TOPDIR);
+ CHECK_FLAG_VALUE(HUGE_FILE);
+ CHECK_FLAG_VALUE(EXTENTS);
+ CHECK_FLAG_VALUE(EA_INODE);
+ CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(RESERVED);
+}
+
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
__u32 group; /* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
__u16 unused;
};
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+ u32 group;
+ compat_u64 block_bitmap;
+ compat_u64 inode_bitmap;
+ compat_u64 inode_table;
+ u32 blocks_count;
+ u16 reserved_blocks;
+ u16 unused;
+};
+#endif
+
/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
struct ext4_new_group_data {
__u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path,
- so set the magic i_delalloc_reserve_flag after taking the
+ so set the magic i_delalloc_reserve_flag after taking the
inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
* ioctl commands in 32 bit emulation
*/
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
#ifdef CONFIG_JBD2_DEBUG
#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
#endif
#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
+#endif
/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
- __u32 i_flags;
- ext4_fsblk_t i_file_acl;
__u32 i_dtime;
+ ext4_fsblk_t i_file_acl;
/*
* i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
*/
ext4_group_t i_block_group;
unsigned long i_state_flags; /* Dynamic state flags */
+ unsigned long i_flags;
ext4_lblk_t i_dir_start_lookup;
#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ EXT4_STATE_NEWENTRY, /* File just added to dir */
};
-static inline int ext4_test_inode_state(struct inode *inode, int bit)
-{
- return test_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
-
-static inline void ext4_set_inode_state(struct inode *inode, int bit)
-{
- set_bit(bit, &EXT4_I(inode)->i_state_flags);
+#define EXT4_INODE_BIT_FNS(name, field) \
+static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
+{ \
+ return test_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
+{ \
+ set_bit(bit, &EXT4_I(inode)->i_##field); \
+} \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{ \
+ clear_bit(bit, &EXT4_I(inode)->i_##field); \
}
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
-{
- clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
* a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
EXT4_FEATURE_COMPAT_DIR_INDEX) && \
- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
ext4_grpblk_t bb_first_free; /* first free block */
ext4_grpblk_t bb_free; /* total free blocks */
ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
+ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
struct list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int max_blocks,
- struct buffer_head *bh_result, int flags);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
sector_t block, unsigned int max_blocks,
struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
return 1;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 1;
return 0;
}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
return 0;
if (EXT4_JOURNAL(inode) == NULL)
return 1;
- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
return 0;
if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return 0;
if (ext4_should_journal_data(inode))
return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
if (err <= 0)
return err;
err = ext4_truncate_restart_trans(handle, inode, needed);
- /*
- * We have dropped i_data_sem so someone might have cached again
- * an extent we are going to truncate.
- */
- ext4_ext_invalidate_cache(inode);
+ if (err == 0)
+ err = -EAGAIN;
return err;
}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
/*
* If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
- * block groups per flexgroup, reserve the first block
- * group for directories and special files. Regular
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
* files will start at the second block group. This
- * tends to speed up directory access and improves
+ * tends to speed up directory access and improves
* fsck times.
*/
block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
return 0;
corrupted:
- __ext4_error(inode->i_sb, function,
- "bad header/extent in inode #%lu: %s - magic %x, "
+ ext4_error_inode(function, inode,
+ "bad header/extent: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
- inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+ error_msg, le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
merge_done = 1;
WARN_ON(eh->eh_entries == 0);
if (!eh->eh_entries)
- ext4_error(inode->i_sb,
- "inode#%lu, eh->eh_entries = 0!",
- inode->i_ino);
+ EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
}
return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
struct ext4_ext_cache *cex;
int ret = EXT4_EXT_CACHE_NO;
- /*
+ /*
* We borrow i_block_reservation_lock to protect i_cached_extent
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
int depth = ext_depth(inode);
struct ext4_ext_path *path;
handle_t *handle;
- int i = 0, err = 0;
+ int i, err;
ext_debug("truncate since %u\n", start);
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
if (IS_ERR(handle))
return PTR_ERR(handle);
+again:
ext4_ext_invalidate_cache(inode);
/*
* We start scanning from right side, freeing all the blocks
* after i_size and walking into the tree depth-wise.
*/
+ depth = ext_depth(inode);
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
if (path == NULL) {
ext4_journal_stop(handle);
return -ENOMEM;
}
+ path[0].p_depth = depth;
path[0].p_hdr = ext_inode_hdr(inode);
if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
- path[0].p_depth = depth;
+ i = err = 0;
while (i >= 0 && err == 0) {
if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
out:
ext4_ext_drop_refs(path);
kfree(path);
+ if (err == -EAGAIN)
+ goto again;
ext4_journal_stop(handle);
return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
/* FIXME!! we need to try to merge to left or right after zero-out */
static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
{
- int ret = -EIO;
+ int ret;
struct bio *bio;
int blkbits, blocksize;
sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
len = ee_len;
bio = bio_alloc(GFP_NOIO, len);
+ if (!bio)
+ return -ENOMEM;
+
bio->bi_sector = ee_pblock;
bio->bi_bdev = inode->i_sb->s_bdev;
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
submit_bio(WRITE, bio);
wait_for_completion(&event);
- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
- ret = 0;
- else {
- ret = -EIO;
- break;
+ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ bio_put(bio);
+ return -EIO;
}
bio_put(bio);
ee_len -= done;
ee_pblock += done << (blkbits - 9);
}
- return ret;
+ return 0;
}
#define EXT4_EXT_ZERO_LEN 7
/*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
* to an uninitialized extent. It may result in splitting the uninitialized
* extent into multiple extents (upto three - one initialized and two
* uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* c> Splits in three extents: Somone is writing in middle of the extent
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
- struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t iblock,
- unsigned int max_blocks)
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
{
struct ext4_extent *ex, newex, orig_ex;
struct ext4_extent *ex1 = NULL;
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
int ret = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map->m_len)
+ eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (iblock - ee_block);
- newblock = iblock - ee_block + ext_pblock(ex);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+ /*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
return allocated;
}
- /* ex1: ee_block to iblock - 1 : uninitialized */
- if (iblock > ee_block) {
+ /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+ if (map->m_lblk > ee_block) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
- if (!ex1 && allocated > max_blocks)
- ex2->ee_len = cpu_to_le16(max_blocks);
+ if (!ex1 && allocated > map->m_len)
+ ex2->ee_len = cpu_to_le16(map->m_len);
/* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN) {
+ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
/*
- * iblock == ee_block is handled by the zerouout
+ * map->m_lblk == ee_block is handled by the zerouout
* at the beginning.
* Mark first half uninitialized.
* Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_dirty(handle, inode, path + depth);
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock);
+ ex3->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex3, newblock);
ex3->ee_len = cpu_to_le16(allocated);
err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
depth = ext_depth(inode);
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode,
- iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk,
+ path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
return allocated;
}
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock + max_blocks);
- ext4_ext_store_pblock(ex3, newblock + max_blocks);
- ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+ ext4_ext_store_pblock(ex3, newblock + map->m_len);
+ ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
if (err)
goto out;
- allocated = max_blocks;
+ allocated = map->m_len;
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
* to insert a extent in the middle zerout directly
* otherwise give the extent a chance to merge to left
*/
if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- iblock != ee_block) {
+ map->m_lblk != ee_block && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
}
}
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
- /* ex2: iblock to iblock + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(iblock);
+ /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
+ ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
}
/*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
* ext4_get_blocks_dio_write() when DIO to write
* to an uninitialized extent.
*
@@ -2927,9 +2946,8 @@ fix_extent_len:
*/
static int ext4_split_unwritten_extents(handle_t *handle,
struct inode *inode,
+ struct ext4_map_blocks *map,
struct ext4_ext_path *path,
- ext4_lblk_t iblock,
- unsigned int max_blocks,
int flags)
{
struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_extent *ex2 = NULL;
struct ext4_extent *ex3 = NULL;
struct ext4_extent_header *eh;
- ext4_lblk_t ee_block;
+ ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
ext4_fsblk_t newblock;
int err = 0;
+ int may_zeroout;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
+
+ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+ inode->i_sb->s_blocksize_bits;
+ if (eof_block < map->m_lblk + map->m_len)
+ eof_block = map->m_lblk + map->m_len;
- ext_debug("ext4_split_unwritten_extents: inode %lu,"
- "iblock %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)iblock, max_blocks);
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (iblock - ee_block);
- newblock = iblock - ee_block + ext_pblock(ex);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ newblock = map->m_lblk - ee_block + ext_pblock(ex);
+
ex2 = ex;
orig_ex.ee_block = ex->ee_block;
orig_ex.ee_len = cpu_to_le16(ee_len);
ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
/*
+ * It is safe to convert extent to initialized via explicit
+ * zeroout only if extent is fully insde i_size or new_size.
+ */
+ may_zeroout = ee_block + ee_len <= eof_block;
+
+ /*
* If the uninitialized extent begins at the same logical
* block where the write begins, and the write completely
* covers the extent, then we don't need to split it.
*/
- if ((iblock == ee_block) && (allocated <= max_blocks))
+ if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
return allocated;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
- /* ex1: ee_block to iblock - 1 : uninitialized */
- if (iblock > ee_block) {
+ /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+ if (map->m_lblk > ee_block) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* we insert ex3, if ex1 is NULL. This is to avoid temporary
* overlap of blocks.
*/
- if (!ex1 && allocated > max_blocks)
- ex2->ee_len = cpu_to_le16(max_blocks);
+ if (!ex1 && allocated > map->m_len)
+ ex2->ee_len = cpu_to_le16(map->m_len);
/* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unsigned int newdepth;
ex3 = &newex;
- ex3->ee_block = cpu_to_le32(iblock + max_blocks);
- ext4_ext_store_pblock(ex3, newblock + max_blocks);
- ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+ ext4_ext_store_pblock(ex3, newblock + map->m_len);
+ ex3->ee_len = cpu_to_le16(allocated - map->m_len);
ext4_ext_mark_uninitialized(ex3);
err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
- /* blocks available from iblock */
+ /* blocks available from map->m_lblk */
return allocated;
} else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
* update the extent length after successful insert of the
* split extent
*/
- orig_ex.ee_len = cpu_to_le16(ee_len -
- ext4_ext_get_actual_len(ex3));
+ ee_len -= ext4_ext_get_actual_len(ex3);
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ may_zeroout = ee_block + ee_len <= eof_block;
+
depth = newdepth;
ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
if (err)
goto out;
- allocated = max_blocks;
+ allocated = map->m_len;
}
/*
* If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
*/
if (ex1 && ex1 != ex) {
ex1 = ex;
- ex1->ee_len = cpu_to_le16(iblock - ee_block);
+ ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
ext4_ext_mark_uninitialized(ex1);
ex2 = &newex;
}
/*
- * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
- * uninitialised still.
+ * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
+ * using direct I/O, uninitialised still.
*/
- ex2->ee_block = cpu_to_le32(iblock);
+ ex2->ee_block = cpu_to_le32(map->m_lblk);
ext4_ext_store_pblock(ex2, newblock);
ex2->ee_len = cpu_to_le16(allocated);
ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
goto out;
insert:
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC) {
+ if (err == -ENOSPC && may_zeroout) {
err = ext4_ext_zeroout(inode, &orig_ex);
if (err)
goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int max_blocks,
+ struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
- unsigned int allocated, struct buffer_head *bh_result,
- ext4_fsblk_t newblock)
+ unsigned int allocated, ext4_fsblk_t newblock)
{
int ret = 0;
int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
"block %llu, max_blocks %u, flags %d, allocated %u",
- inode->i_ino, (unsigned long long)iblock, max_blocks,
+ inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle,
- inode, path, iblock,
- max_blocks, flags);
+ ret = ext4_split_unwritten_extents(handle, inode, map,
+ path, flags);
/*
* Flag the inode(non aio case) or end_io struct (aio case)
* that this IO needs to convertion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
if (ext4_should_dioread_nolock(inode))
- set_buffer_uninit(bh_result);
+ map->m_flags |= EXT4_MAP_UNINIT;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* the buffer head will be unmapped so that
* a read from the block returns 0s.
*/
- set_buffer_unwritten(bh_result);
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
goto out1;
}
/* buffered write, writepage time, convert*/
- ret = ext4_ext_convert_to_initialized(handle, inode,
- path, iblock,
- max_blocks);
+ ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out:
@@ -3226,7 +3256,7 @@ out:
goto out2;
} else
allocated = ret;
- set_buffer_new(bh_result);
+ map->m_flags |= EXT4_MAP_NEW;
/*
* if we allocated more blocks than requested
* we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
* unmapped later when we find the buffer_head marked
* new.
*/
- if (allocated > max_blocks) {
+ if (allocated > map->m_len) {
unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
- newblock + max_blocks,
- allocated - max_blocks);
- allocated = max_blocks;
+ newblock + map->m_len,
+ allocated - map->m_len);
+ allocated = map->m_len;
}
/*
@@ -3252,13 +3282,13 @@ out:
ext4_da_update_reserve_space(inode, allocated, 0);
map_out:
- set_buffer_mapped(bh_result);
+ map->m_flags |= EXT4_MAP_MAPPED;
out1:
- if (allocated > max_blocks)
- allocated = max_blocks;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
out2:
if (path) {
ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
*
* return < 0, error case.
*/
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock,
- unsigned int max_blocks, struct buffer_head *bh_result,
- int flags)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent_header *eh;
struct ext4_extent newex, *ex, *last_ex;
ext4_fsblk_t newblock;
- int err = 0, depth, ret, cache_type;
+ int i, err = 0, depth, ret, cache_type;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
- __clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%u requested for inode %lu\n",
- iblock, max_blocks, inode->i_ino);
+ map->m_lblk, map->m_len, inode->i_ino);
/* check in cache */
- cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+ cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
if (cache_type) {
if (cache_type == EXT4_EXT_CACHE_GAP) {
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* we should allocate requested block */
} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
/* block is already allocated */
- newblock = iblock
+ newblock = map->m_lblk
- le32_to_cpu(newex.ee_block)
+ ext_pblock(&newex);
/* number of remaining blocks in the extent */
allocated = ext4_ext_get_actual_len(&newex) -
- (iblock - le32_to_cpu(newex.ee_block));
+ (map->m_lblk - le32_to_cpu(newex.ee_block));
goto out;
} else {
BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
}
/* find extent for this block */
- path = ext4_ext_find_extent(inode, iblock, NULL);
+ path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
if (IS_ERR(path)) {
err = PTR_ERR(path);
path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
EXT4_ERROR_INODE(inode, "bad extent address "
- "iblock: %d, depth: %d pblock %lld",
- iblock, depth, path[depth].p_block);
+ "lblock: %lu, depth: %d pblock %lld",
+ (unsigned long) map->m_lblk, depth,
+ path[depth].p_block);
err = -EIO;
goto out2;
}
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
ee_len = ext4_ext_get_actual_len(ex);
/* if found extent covers block, simply return it */
- if (in_range(iblock, ee_block, ee_len)) {
- newblock = iblock - ee_block + ee_start;
+ if (in_range(map->m_lblk, ee_block, ee_len)) {
+ newblock = map->m_lblk - ee_block + ee_start;
/* number of remaining blocks in the extent */
- allocated = ee_len - (iblock - ee_block);
- ext_debug("%u fit into %u:%d -> %llu\n", iblock,
- ee_block, ee_len, newblock);
+ allocated = ee_len - (map->m_lblk - ee_block);
+ ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+ ee_block, ee_len, newblock);
/* Do not put uninitialized extent in the cache */
if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out;
}
ret = ext4_ext_handle_uninitialized_extents(handle,
- inode, iblock, max_blocks, path,
- flags, allocated, bh_result, newblock);
+ inode, map, path, flags, allocated,
+ newblock);
return ret;
}
}
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* put just found gap into cache to speed up
* subsequent requests
*/
- ext4_ext_put_gap_in_cache(inode, path, iblock);
+ ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
goto out2;
}
/*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
/* find neighbour allocated blocks */
- ar.lleft = iblock;
+ ar.lleft = map->m_lblk;
err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
if (err)
goto out2;
- ar.lright = iblock;
+ ar.lright = map->m_lblk;
err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
if (err)
goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
* EXT_UNINIT_MAX_LEN.
*/
- if (max_blocks > EXT_INIT_MAX_LEN &&
+ if (map->m_len > EXT_INIT_MAX_LEN &&
!(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- max_blocks = EXT_INIT_MAX_LEN;
- else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+ map->m_len = EXT_INIT_MAX_LEN;
+ else if (map->m_len > EXT_UNINIT_MAX_LEN &&
(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- max_blocks = EXT_UNINIT_MAX_LEN;
+ map->m_len = EXT_UNINIT_MAX_LEN;
- /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
- newex.ee_block = cpu_to_le32(iblock);
- newex.ee_len = cpu_to_le16(max_blocks);
+ /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+ newex.ee_block = cpu_to_le32(map->m_lblk);
+ newex.ee_len = cpu_to_le16(map->m_len);
err = ext4_ext_check_overlap(inode, &newex, path);
if (err)
allocated = ext4_ext_get_actual_len(&newex);
else
- allocated = max_blocks;
+ allocated = map->m_len;
/* allocate new block */
ar.inode = inode;
- ar.goal = ext4_ext_find_goal(inode, path, iblock);
- ar.logical = iblock;
+ ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+ ar.logical = map->m_lblk;
ar.len = allocated;
if (S_ISREG(inode->i_mode))
ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
EXT4_STATE_DIO_UNWRITTEN);
}
if (ext4_should_dioread_nolock(inode))
- set_buffer_uninit(bh_result);
+ map->m_flags |= EXT4_MAP_UNINIT;
}
- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
if (unlikely(!eh->eh_entries)) {
EXT4_ERROR_INODE(inode,
- "eh->eh_entries == 0 ee_block %d",
- ex->ee_block);
+ "eh->eh_entries == 0 and "
+ "EOFBLOCKS_FL set");
err = -EIO;
goto out2;
}
last_ex = EXT_LAST_EXTENT(eh);
- if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
- + ext4_ext_get_actual_len(last_ex))
- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ /*
+ * If the current leaf block was reached by looking at
+ * the last index block all the way down the tree, and
+ * we are extending the inode beyond the last extent
+ * in the current leaf block, then clear the
+ * EOFBLOCKS_FL flag.
+ */
+ for (i = depth-1; i >= 0; i--) {
+ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+ break;
+ }
+ if ((i < 0) &&
+ (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+ ext4_ext_get_actual_len(last_ex)))
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
- if (allocated > max_blocks)
- allocated = max_blocks;
- set_buffer_new(bh_result);
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_flags |= EXT4_MAP_NEW;
/*
* Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
* when it is _not_ an uninitialized extent.
*/
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
- ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
EXT4_EXT_CACHE_EXTENT);
ext4_update_inode_fsync_trans(handle, inode, 1);
} else
ext4_update_inode_fsync_trans(handle, inode, 0);
out:
- if (allocated > max_blocks)
- allocated = max_blocks;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
- set_buffer_mapped(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
+ map->m_len = allocated;
out2:
if (path) {
ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
* can proceed even if the new size is the same as i_size.
*/
if (new_size > i_size_read(inode))
- EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
{
handle_t *handle;
- ext4_lblk_t block;
loff_t new_size;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
int retries = 0;
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
/* preallocation to directories is currently not supported */
if (S_ISDIR(inode->i_mode))
return -ENODEV;
- block = offset >> blkbits;
+ map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - block;
+ - map.m_lblk;
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
mutex_lock(&inode->i_mutex);
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
retry:
while (ret >= 0 && ret < max_blocks) {
- block = block + ret;
- max_blocks = max_blocks - ret;
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = max_blocks = max_blocks - ret;
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
- map_bh.b_state = 0;
- ret = ext4_get_blocks(handle, inode, block,
- max_blocks, &map_bh,
+ ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ printk(KERN_ERR "%s: ext4_ext_map_blocks "
"returned error inode#%lu, block=%u, "
"max_blocks=%u", __func__,
inode->i_ino, block, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
ret2 = ext4_journal_stop(handle);
break;
}
- if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+ if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
blkbits) >> blkbits))
new_size = offset + len;
else
- new_size = (block + ret) << blkbits;
+ new_size = (map.m_lblk + ret) << blkbits;
ext4_falloc_update_inode(inode, mode, new_size,
- buffer_new(&map_bh));
+ (map.m_flags & EXT4_MAP_NEW));
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len)
{
handle_t *handle;
- ext4_lblk_t block;
unsigned int max_blocks;
int ret = 0;
int ret2 = 0;
- struct buffer_head map_bh;
+ struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- block = offset >> blkbits;
+ map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
- max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - block;
+ max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+ map.m_lblk);
/*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, max_blocks);
while (ret >= 0 && ret < max_blocks) {
- block = block + ret;
- max_blocks = max_blocks - ret;
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
break;
}
- map_bh.b_state = 0;
- ret = ext4_get_blocks(handle, inode, block,
- max_blocks, &map_bh,
+ ret = ext4_map_blocks(handle, inode, &map,
EXT4_GET_BLOCKS_IO_CONVERT_EXT);
if (ret <= 0) {
WARN_ON(ret <= 0);
- printk(KERN_ERR "%s: ext4_ext_get_blocks "
+ printk(KERN_ERR "%s: ext4_ext_map_blocks "
"returned error inode#%lu, block=%u, "
"max_blocks=%u", __func__,
- inode->i_ino, block, max_blocks);
+ inode->i_ino, map.m_lblk, map.m_len);
}
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int error = 0;
/* fallback to generic here if not in extents fmt */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
ext4_get_block);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
* is smaller than s_maxbytes, which is for extent-mapped files.
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
size_t length = iov_length(iov, nr_segs);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..b6a74f991bf4 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
#include <trace/events/ext4.h>
/*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file. This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+ struct dentry *dentry = NULL;
+
+ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+ dentry = list_entry(inode->i_dentry.next,
+ struct dentry, d_alias);
+ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+ break;
+ inode = dentry->d_parent->d_inode;
+ sync_mapping_buffers(inode->i_mapping);
+ }
+}
+
+/*
* akpm: A new design for ext4_sync_file().
*
* This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
ret = flush_completed_IO(inode);
if (ret < 0)
return ret;
-
- if (!journal)
- return simple_fsync(file, dentry, datasync);
+
+ if (!journal) {
+ ret = simple_fsync(file, dentry, datasync);
+ if (!ret && !list_empty(&inode->i_dentry))
+ ext4_sync_parent(inode);
+ return ret;
+ }
/*
* data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
NULL, BLKDEV_IFL_WAIT);
- jbd2_log_wait_commit(journal, commit_tid);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (fatal)
goto error_return;
- /* Ok, now we can actually update the inode bitmaps.. */
- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit, bitmap_bh->b_data);
- if (!cleared)
- ext4_error(sb, "bit already cleared for inode %lu", ino);
- else {
- gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+ fatal = -ESRCH;
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
+ if (gdp) {
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
- if (fatal) goto error_return;
-
- if (gdp) {
- ext4_lock_group(sb, block_group);
- count = ext4_free_inodes_count(sb, gdp) + 1;
- ext4_free_inodes_set(sb, gdp, count);
- if (is_directory) {
- count = ext4_used_dirs_count(sb, gdp) - 1;
- ext4_used_dirs_set(sb, gdp, count);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
- }
+ }
+ ext4_lock_group(sb, block_group);
+ cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+ if (fatal || !cleared) {
+ ext4_unlock_group(sb, block_group);
+ goto out;
+ }
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi,
- block_group, gdp);
- ext4_unlock_group(sb, block_group);
- percpu_counter_inc(&sbi->s_freeinodes_counter);
- if (is_directory)
- percpu_counter_dec(&sbi->s_dirs_counter);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f;
-
- f = ext4_flex_group(sbi, block_group);
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
- }
- }
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bh2);
- if (!fatal) fatal = err;
+ count = ext4_free_inodes_count(sb, gdp) + 1;
+ ext4_free_inodes_set(sb, gdp, count);
+ if (is_directory) {
+ count = ext4_used_dirs_count(sb, gdp) - 1;
+ ext4_used_dirs_set(sb, gdp, count);
+ percpu_counter_dec(&sbi->s_dirs_counter);
}
- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
- if (!fatal)
- fatal = err;
- sb->s_dirt = 1;
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+ ext4_unlock_group(sb, block_group);
+
+ percpu_counter_inc(&sbi->s_freeinodes_counter);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ if (is_directory)
+ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+ if (cleared) {
+ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+ if (!fatal)
+ fatal = err;
+ sb->s_dirt = 1;
+ } else
+ ext4_error(sb, "bit already cleared for inode %lu", ino);
+
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
if (S_ISDIR(mode) &&
((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
int best_ndir = inodes_per_group;
int ret = -1;
@@ -1041,7 +1034,7 @@ got:
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode);
}
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
int ret;
/*
- * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+ * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
* i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
if (blk &&
unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
blk, 1))) {
- __ext4_error(inode->i_sb, function,
- "invalid block reference %u "
- "in inode #%lu", blk, inode->i_ino);
+ ext4_error_inode(function, inode,
+ "invalid block reference %u", blk);
return -EIO;
}
}
@@ -785,7 +784,7 @@ failed:
/* Allocation failed, free what we already allocated */
ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
- /*
+ /*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
err_out:
for (i = 1; i <= num; i++) {
- /*
+ /*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
}
/*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
* (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
*
* Allocation strategy is simple: if we have to allocate something, we will
* have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
* down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
* blocks.
*/
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
- ext4_lblk_t iblock, unsigned int maxblocks,
- struct buffer_head *bh_result,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
int flags)
{
int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
int count = 0;
ext4_fsblk_t first_block = 0;
- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
- depth = ext4_block_to_path(inode, iblock, offsets,
+ depth = ext4_block_to_path(inode, map->m_lblk, offsets,
&blocks_to_boundary);
if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
/* Simplest case - block found, no allocation needed */
if (!partial) {
first_block = le32_to_cpu(chain[depth - 1].key);
- clear_buffer_new(bh_result);
count++;
/*map more blocks*/
- while (count < maxblocks && count <= blocks_to_boundary) {
+ while (count < map->m_len && count <= blocks_to_boundary) {
ext4_fsblk_t blk;
blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
- goal = ext4_find_goal(inode, iblock, partial);
+ goal = ext4_find_goal(inode, map->m_lblk, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
* direct blocks to allocate for this branch.
*/
count = ext4_blks_to_allocate(partial, indirect_blks,
- maxblocks, blocks_to_boundary);
+ map->m_len, blocks_to_boundary);
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
&count, goal,
offsets + (partial - chain), partial);
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
* may need to return -EAGAIN upwards in the worst case. --sct
*/
if (!err)
- err = ext4_splice_branch(handle, inode, iblock,
+ err = ext4_splice_branch(handle, inode, map->m_lblk,
partial, indirect_blks, count);
if (err)
goto cleanup;
- set_buffer_new(bh_result);
+ map->m_flags |= EXT4_MAP_NEW;
ext4_update_inode_fsync_trans(handle, inode, 1);
got_it:
- map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = le32_to_cpu(chain[depth-1].key);
+ map->m_len = count;
if (count > blocks_to_boundary)
- set_buffer_boundary(bh_result);
+ map->m_flags |= EXT4_MAP_BOUNDARY;
err = count;
/* Clean up and exit */
partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
brelse(partial->bh);
partial--;
}
- BUFFER_TRACE(bh_result, "returned");
out:
return err;
}
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
*/
static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
{
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_calc_metadata_amount(inode, lblock);
return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- int mdb_free = 0, allocated_meta_blocks = 0;
spin_lock(&ei->i_block_reservation_lock);
trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
/* Update per-inode reservations */
ei->i_reserved_data_blocks -= used;
- used += ei->i_allocated_meta_blocks;
ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
- allocated_meta_blocks = ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ used + ei->i_allocated_meta_blocks);
ei->i_allocated_meta_blocks = 0;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
if (ei->i_reserved_data_blocks == 0) {
/*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
* only when we have written all of the delayed
* allocation blocks.
*/
- mdb_free = ei->i_reserved_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
- percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
}
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- /* Update quota subsystem */
- if (quota_claim) {
+ /* Update quota subsystem for data blocks */
+ if (quota_claim)
dquot_claim_block(inode, used);
- if (mdb_free)
- dquot_release_reservation_block(inode, mdb_free);
- } else {
+ else {
/*
* We did fallocate with an offset that is already delayed
* allocated. So on delayed allocated writeback we should
- * not update the quota for allocated blocks. But then
- * converting an fallocate region to initialized region would
- * have caused a metadata allocation. So claim quota for
- * that
+ * not re-claim the quota for fallocated blocks.
*/
- if (allocated_meta_blocks)
- dquot_claim_block(inode, allocated_meta_blocks);
- dquot_release_reservation_block(inode, mdb_free + used);
+ dquot_release_reservation_block(inode, used);
}
/*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
ext4_discard_preallocations(inode);
}
-static int check_block_validity(struct inode *inode, const char *msg,
- sector_t logical, sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *func,
+ struct ext4_map_blocks *map)
{
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
- __ext4_error(inode->i_sb, msg,
- "inode #%lu logical block %llu mapped to %llu "
- "(size %d)", inode->i_ino,
- (unsigned long long) logical,
- (unsigned long long) phys, len);
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+ map->m_len)) {
+ ext4_error_inode(func, inode,
+ "lblock %lu mapped to illegal pblock %llu "
+ "(length %d)", (unsigned long) map->m_lblk,
+ map->m_pblk, map->m_len);
return -EIO;
}
return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
* and store the allocated blocks in the result buffer head and mark it
* mapped.
*
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
* On success, it returns the number of blocks being mapped or allocate.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
*
* It returns the error in case of allocation failure.
*/
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
- unsigned int max_blocks, struct buffer_head *bh,
- int flags)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
{
int retval;
- clear_buffer_mapped(bh);
- clear_buffer_unwritten(bh);
-
- ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
- "logical block %lu\n", inode->i_ino, flags, max_blocks,
- (unsigned long)block);
+ map->m_flags = 0;
+ ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+ "logical block %lu\n", inode->i_ino, flags, map->m_len,
+ (unsigned long) map->m_lblk);
/*
* Try to see if we can get the block without requesting a new
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, 0);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, 0);
} else {
- retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
- bh, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, 0);
}
up_read((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, "file system corruption",
- block, bh->b_blocknr, retval);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret = check_block_validity(inode, __func__, map);
if (ret != 0)
return ret;
}
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* ext4_ext_get_block() returns th create = 0
* with buffer head unmapped.
*/
- if (retval > 0 && buffer_mapped(bh))
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
return retval;
/*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* of BH_Unwritten and BH_Mapped flags being simultaneously
* set on the buffer_head.
*/
- clear_buffer_unwritten(bh);
+ map->m_flags &= ~EXT4_MAP_UNWRITTEN;
/*
* New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
*/
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
- bh, flags);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
} else {
- retval = ext4_ind_get_blocks(handle, inode, block,
- max_blocks, bh, flags);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
- if (retval > 0 && buffer_new(bh)) {
+ if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
/*
* We allocated new blocks which will result in
* i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
EXT4_I(inode)->i_delalloc_reserved_flag = 0;
up_write((&EXT4_I(inode)->i_data_sem));
- if (retval > 0 && buffer_mapped(bh)) {
- int ret = check_block_validity(inode, "file system "
- "corruption after allocation",
- block, bh->b_blocknr, retval);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret = check_block_validity(inode,
+ "ext4_map_blocks_after_alloc",
+ map);
if (ret != 0)
return ret;
}
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
-int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int flags)
{
handle_t *handle = ext4_journal_current_handle();
+ struct ext4_map_blocks map;
int ret = 0, started = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
int dio_credits;
- if (create && !handle) {
+ map.m_lblk = iblock;
+ map.m_len = bh->b_size >> inode->i_blkbits;
+
+ if (flags && !handle) {
/* Direct IO write... */
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ if (map.m_len > DIO_MAX_BLOCKS)
+ map.m_len = DIO_MAX_BLOCKS;
+ dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
handle = ext4_journal_start(inode, dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out;
+ return ret;
}
started = 1;
}
- ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
- create ? EXT4_GET_BLOCKS_CREATE : 0);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ bh->b_size = inode->i_sb->s_blocksize * map.m_len;
ret = 0;
}
if (started)
ext4_journal_stop(handle);
-out:
return ret;
}
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ return _ext4_get_block(inode, iblock, bh,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
/*
* `handle' can be NULL if create is zero
*/
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *errp)
{
- struct buffer_head dummy;
+ struct ext4_map_blocks map;
+ struct buffer_head *bh;
int fatal = 0, err;
- int flags = 0;
J_ASSERT(handle != NULL || create == 0);
- dummy.b_state = 0;
- dummy.b_blocknr = -1000;
- buffer_trace_init(&dummy.b_history);
- if (create)
- flags |= EXT4_GET_BLOCKS_CREATE;
- err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
- /*
- * ext4_get_blocks() returns number of blocks mapped. 0 in
- * case of a HOLE.
- */
- if (err > 0) {
- if (err > 1)
- WARN_ON(1);
- err = 0;
+ map.m_lblk = block;
+ map.m_len = 1;
+ err = ext4_map_blocks(handle, inode, &map,
+ create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+ if (err < 0)
+ *errp = err;
+ if (err <= 0)
+ return NULL;
+ *errp = 0;
+
+ bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!bh) {
+ *errp = -EIO;
+ return NULL;
}
- *errp = err;
- if (!err && buffer_mapped(&dummy)) {
- struct buffer_head *bh;
- bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
- if (!bh) {
- *errp = -EIO;
- goto err;
- }
- if (buffer_new(&dummy)) {
- J_ASSERT(create != 0);
- J_ASSERT(handle != NULL);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ J_ASSERT(create != 0);
+ J_ASSERT(handle != NULL);
- /*
- * Now that we do not always journal data, we should
- * keep in mind whether this should always journal the
- * new buffer as metadata. For now, regular file
- * writes use ext4_get_block instead, so it's not a
- * problem.
- */
- lock_buffer(bh);
- BUFFER_TRACE(bh, "call get_create_access");
- fatal = ext4_journal_get_create_access(handle, bh);
- if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data, 0, inode->i_sb->s_blocksize);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, inode, bh);
- if (!fatal)
- fatal = err;
- } else {
- BUFFER_TRACE(bh, "not a new buffer");
- }
- if (fatal) {
- *errp = fatal;
- brelse(bh);
- bh = NULL;
+ /*
+ * Now that we do not always journal data, we should
+ * keep in mind whether this should always journal the
+ * new buffer as metadata. For now, regular file
+ * writes use ext4_get_block instead, so it's not a
+ * problem.
+ */
+ lock_buffer(bh);
+ BUFFER_TRACE(bh, "call get_create_access");
+ fatal = ext4_journal_get_create_access(handle, bh);
+ if (!fatal && !buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+ set_buffer_uptodate(bh);
}
- return bh;
+ unlock_buffer(bh);
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!fatal)
+ fatal = err;
+ } else {
+ BUFFER_TRACE(bh, "not a new buffer");
}
-err:
- return NULL;
+ if (fatal) {
+ *errp = fatal;
+ brelse(bh);
+ bh = NULL;
+ }
+ return bh;
}
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
- unsigned long md_needed, md_reserved;
+ unsigned long md_needed;
int ret;
/*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
*/
repeat:
spin_lock(&ei->i_block_reservation_lock);
- md_reserved = ei->i_reserved_meta_blocks;
md_needed = ext4_calc_metadata_amount(inode, lblock);
trace_ext4_da_reserve_space(inode, md_needed);
spin_unlock(&ei->i_block_reservation_lock);
/*
- * Make quota reservation here to prevent quota overflow
- * later. Real quota accounting is done at pages writeout
- * time.
+ * We will charge metadata quota at writeout time; this saves
+ * us from metadata over-estimation, though we may go over by
+ * a small amount in the end. Here we just reserve for data.
*/
- ret = dquot_reserve_block(inode, md_needed + 1);
+ ret = dquot_reserve_block(inode, 1);
if (ret)
return ret;
-
+ /*
+ * We do still charge estimated metadata to the sb though;
+ * we cannot afford to run out of free blocks.
+ */
if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
- dquot_release_reservation_block(inode, md_needed + 1);
+ dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ trace_ext4_da_release_space(inode, to_free);
if (unlikely(to_free > ei->i_reserved_data_blocks)) {
/*
* if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
* only when we have written all of the delayed
* allocation blocks.
*/
- to_free += ei->i_reserved_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ ei->i_reserved_meta_blocks);
ei->i_reserved_meta_blocks = 0;
ei->i_da_metadata_calc_len = 0;
}
- /* update fs dirty blocks counter */
+ /* update fs dirty data blocks counter */
percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
/*
* mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
*
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
* the function goes through all passed space and put actual disk
* block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
*/
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
- struct buffer_head *exbh)
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
+ struct ext4_map_blocks *map)
{
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- int blocks = exbh->b_size >> inode->i_blkbits;
- sector_t pblock = exbh->b_blocknr, cur_logical;
+ int blocks = map->m_len;
+ sector_t pblock = map->m_pblk, cur_logical;
struct buffer_head *head, *bh;
pgoff_t index, end;
struct pagevec pvec;
int nr_pages, i;
- index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
/* skip blocks out of the range */
do {
- if (cur_logical >= logical)
+ if (cur_logical >= map->m_lblk)
break;
cur_logical++;
} while ((bh = bh->b_this_page) != head);
do {
- if (cur_logical >= logical + blocks)
+ if (cur_logical >= map->m_lblk + blocks)
break;
- if (buffer_delay(bh) ||
- buffer_unwritten(bh)) {
+ if (buffer_delay(bh) || buffer_unwritten(bh)) {
BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
} else if (buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
- if (buffer_uninit(exbh))
+ if (map->m_flags & EXT4_MAP_UNINIT)
set_buffer_uninit(bh);
cur_logical++;
pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
}
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
- struct buffer_head *bh)
-{
- struct block_device *bdev = inode->i_sb->s_bdev;
- int blocks, i;
-
- blocks = bh->b_size >> inode->i_blkbits;
- for (i = 0; i < blocks; i++)
- unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
-
static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
sector_t logical, long blk_cnt)
{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err, blks, get_blocks_flags;
- struct buffer_head new;
+ struct ext4_map_blocks map;
sector_t next = mpd->b_blocknr;
unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
* variables are updated after the blocks have been allocated.
*/
- new.b_state = 0;
+ map.m_lblk = next;
+ map.m_len = max_blocks;
get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
if (ext4_should_dioread_nolock(mpd->inode))
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (mpd->b_state & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
- blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
- &new, get_blocks_flags);
+ blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
if (blks < 0) {
err = blks;
/*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
ext4_msg(mpd->inode->i_sb, KERN_CRIT,
"delayed block allocation failed for inode %lu at "
"logical offset %llu with max blocks %zd with "
- "error %d\n", mpd->inode->i_ino,
+ "error %d", mpd->inode->i_ino,
(unsigned long long) next,
mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
}
BUG_ON(blks == 0);
- new.b_size = (blks << mpd->inode->i_blkbits);
+ if (map.m_flags & EXT4_MAP_NEW) {
+ struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+ int i;
- if (buffer_new(&new))
- __unmap_underlying_blocks(mpd->inode, &new);
+ for (i = 0; i < map.m_len; i++)
+ unmap_underlying_metadata(bdev, map.m_pblk + i);
+ }
/*
* If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
*/
if ((mpd->b_state & (1 << BH_Delay)) ||
(mpd->b_state & (1 << BH_Unwritten)))
- mpage_put_bnr_to_bhs(mpd, next, &new);
+ mpage_put_bnr_to_bhs(mpd, &map);
if (ext4_should_order_data(mpd->inode)) {
err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
sector_t next;
int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+ /*
+ * XXX Don't go larger than mballoc is willing to allocate
+ * This is a stopgap solution. We eventually need to fold
+ * mpage_da_submit_io() into this function and then call
+ * ext4_get_blocks() multiple times in a loop
+ */
+ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+ goto flush_it;
+
/* check if thereserved journal credits might overflow */
- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
if (nrblocks >= EXT4_MAX_TRANS_DATA) {
/*
* With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
struct buffer_head *bh, *head;
sector_t logical;
- if (mpd->io_done) {
- /*
- * Rest of the page in the page_vec
- * redirty then and skip then. We will
- * try to write them again after
- * starting a new transaction
- */
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return MPAGE_DA_EXTENT_TAIL;
- }
/*
* Can we merge this page to current extent?
*/
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
* initialized properly.
*/
static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+ struct buffer_head *bh, int create)
{
+ struct ext4_map_blocks map;
int ret = 0;
sector_t invalid_block = ~((sector_t) 0xffff);
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
invalid_block = ~0;
BUG_ON(create == 0);
- BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+ BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+ map.m_lblk = iblock;
+ map.m_len = 1;
/*
* first, we need to know whether the block is allocated already
* preallocated blocks are unmapped but should treated
* the same as allocated blocks.
*/
- ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0);
- if ((ret == 0) && !buffer_delay(bh_result)) {
- /* the block isn't (pre)allocated yet, let's reserve space */
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret < 0)
+ return ret;
+ if (ret == 0) {
+ if (buffer_delay(bh))
+ return 0; /* Not sure this could or should happen */
/*
* XXX: __block_prepare_write() unmaps passed block,
* is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
/* not enough space to reserve */
return ret;
- map_bh(bh_result, inode->i_sb, invalid_block);
- set_buffer_new(bh_result);
- set_buffer_delay(bh_result);
- } else if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- if (buffer_unwritten(bh_result)) {
- /* A delayed write to unwritten bh should
- * be marked new and mapped. Mapped ensures
- * that we don't do get_block multiple times
- * when we write to the same offset and new
- * ensures that we do proper zero out for
- * partial write.
- */
- set_buffer_new(bh_result);
- set_buffer_mapped(bh_result);
- }
- ret = 0;
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
+ return 0;
}
- return ret;
+ map_bh(bh, inode->i_sb, map.m_pblk);
+ bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+ if (buffer_unwritten(bh)) {
+ /* A delayed write to unwritten bh should be marked
+ * new and mapped. Mapped ensures that we don't do
+ * get_block multiple times when we write to the same
+ * offset and new ensures that we do proper zero out
+ * for partial write.
+ */
+ set_buffer_new(bh);
+ set_buffer_mapped(bh);
+ }
+ return 0;
}
/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- int ret = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-
- /*
- * we don't want to do block allocation in writepage
- * so call get_block_wrap with create = 0
- */
- ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
- return ret;
+ return _ext4_get_block(inode, iblock, bh_result, 0);
}
static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* number of contiguous block. So we will limit
* number of contiguous block to a sane value
*/
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
(max_blocks > EXT4_MAX_TRANS_DATA))
max_blocks = EXT4_MAX_TRANS_DATA;
return ext4_chunk_trans_blocks(inode, max_blocks);
}
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages(). Differences:
+ * Range cyclic is ignored.
+ * no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+{
+ int ret = 0;
+ int done = 0;
+ struct pagevec pvec;
+ int nr_pages;
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
+ long nr_to_write = wbc->nr_to_write;
+
+ pagevec_init(&pvec, 0);
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+ while (!done && (index <= end)) {
+ int i;
+
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ done = 1;
+ break;
+ }
+
+ lock_page(page);
+
+ /*
+ * Page truncated or invalidated. We can freely skip it
+ * then, even for data integrity operations: the page
+ * has disappeared concurrently, so there could be no
+ * real expectation of this data interity operation
+ * even if there is now a new, dirty page at the same
+ * pagecache address.
+ */
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
+ }
+
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = __mpage_da_writepage(page, wbc, mpd);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+ done = 1;
+ break;
+ }
+ }
+
+ if (nr_to_write > 0) {
+ nr_to_write--;
+ if (nr_to_write == 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ /*
+ * We stop writing back only if we are
+ * not doing integrity sync. In case of
+ * integrity sync we have to keep going
+ * because someone may be concurrently
+ * dirtying pages, and we might have
+ * synced a lot of newly appeared dirty
+ * pages, but have not synced all of the
+ * old dirty pages.
+ */
+ done = 1;
+ break;
+ }
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ return ret;
+}
+
+
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
handle_t *handle = NULL;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
- int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
mpd.wbc = wbc;
mpd.inode = mapping->host;
- /*
- * we don't want write_cache_pages to update
- * nr_to_write and writeback_index
- */
- no_nrwrite_index_update = wbc->no_nrwrite_index_update;
- wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
retry:
@@ -2941,7 +3011,7 @@ retry:
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
- "%ld pages, ino %lu; err %d\n", __func__,
+ "%ld pages, ino %lu; err %d", __func__,
wbc->nr_to_write, inode->i_ino, ret);
goto out_writepages;
}
@@ -2963,8 +3033,7 @@ retry:
mpd.io_done = 0;
mpd.pages_written = 0;
mpd.retval = 0;
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
- &mpd);
+ ret = write_cache_pages_da(mapping, wbc, &mpd);
/*
* If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
if (pages_skipped != wbc->pages_skipped)
ext4_msg(inode->i_sb, KERN_CRIT,
"This should not happen leaving %s "
- "with nr_to_write = %ld ret = %d\n",
+ "with nr_to_write = %ld ret = %d",
__func__, wbc->nr_to_write, ret);
/* Update index */
@@ -3030,8 +3099,6 @@ retry:
mapping->writeback_index = index;
out_writepages:
- if (!no_nrwrite_index_update)
- wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
wbc->range_start = range_start;
trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- int ret, retries = 0, quota_retries = 0;
+ int ret, retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-
- if ((ret == -EDQUOT) &&
- EXT4_I(inode)->i_reserved_meta_blocks &&
- (quota_retries++ < 3)) {
- /*
- * Since we often over-estimate the number of meta
- * data blocks required, we may sometimes get a
- * spurios out of quota error even though there would
- * be enough space once we write the data blocks and
- * find out how many meta data blocks were _really_
- * required. So try forcing the inode write to see if
- * that helps.
- */
- write_inode_now(inode, (quota_retries == 3));
- goto retry;
- }
out:
return ret;
}
@@ -3546,46 +3597,18 @@ out:
return ret;
}
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
static int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
- handle_t *handle = ext4_journal_current_handle();
- int ret = 0;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- int dio_credits;
- int started = 0;
-
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
inode->i_ino, create);
- /*
- * ext4_get_block in prepare for a DIO write or buffer write.
- * We allocate an uinitialized extent if blocks haven't been allocated.
- * The extent will be converted to initialized after IO complete.
- */
- create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
-
- if (!handle) {
- if (max_blocks > DIO_MAX_BLOCKS)
- max_blocks = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
- handle = ext4_journal_start(inode, dio_credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
- started = 1;
- }
-
- ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
- create);
- if (ret > 0) {
- bh_result->b_size = (ret << inode->i_blkbits);
- ret = 0;
- }
- if (started)
- ext4_journal_stop(handle);
-out:
- return ret;
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
count)) {
- ext4_error(inode->i_sb, "inode #%lu: "
- "attempt to clear blocks %llu len %lu, invalid",
- inode->i_ino, (unsigned long long) block_to_free,
- count);
+ EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+ "blocks %llu len %lu",
+ (unsigned long long) block_to_free, count);
return 1;
}
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
ext4_handle_dirty_metadata(handle, inode, this_bh);
else
- ext4_error(inode->i_sb,
- "circular indirect block detected, "
- "inode=%lu, block=%llu",
- inode->i_ino,
- (unsigned long long) this_bh->b_blocknr);
+ EXT4_ERROR_INODE(inode,
+ "circular indirect block detected at "
+ "block %llu",
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
nr, 1)) {
- ext4_error(inode->i_sb,
- "indirect mapped block in inode "
- "#%lu invalid (level %d, blk #%lu)",
- inode->i_ino, depth,
- (unsigned long) nr);
+ EXT4_ERROR_INODE(inode,
+ "invalid indirect mapped "
+ "block %lu (level %d)",
+ (unsigned long) nr, depth);
break;
}
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
* (should be rare).
*/
if (!bh) {
- ext4_error(inode->i_sb,
- "Read failure, inode=%lu, block=%llu",
- inode->i_ino, nr);
+ EXT4_ERROR_INODE(inode,
+ "Read failure block=%llu",
+ (unsigned long long) nr);
continue;
}
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
ext4_ext_truncate(inode);
return;
}
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
bh = sb_getblk(sb, block);
if (!bh) {
- ext4_error(sb, "unable to read inode block - "
- "inode=%lu, block=%llu", inode->i_ino, block);
+ EXT4_ERROR_INODE(inode, "unable to read inode block - "
+ "block %llu", block);
return -EIO;
}
if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
submit_bh(READ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(sb, "unable to read inode block - inode=%lu,"
- " block=%llu", inode->i_ino, block);
+ EXT4_ERROR_INODE(inode, "unable to read inode "
+ "block %llu", block);
brelse(bh);
return -EIO;
}
@@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ret = 0;
if (ei->i_file_acl &&
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
- ext4_error(sb, "bad extended attribute block %llu inode #%lu",
- ei->i_file_acl, inode->i_ino);
+ EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+ ei->i_file_acl);
ret = -EIO;
goto bad_inode;
} else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
} else {
ret = -EIO;
- ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
- inode->i_mode, inode->i_ino);
+ EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
brelse(iloc.bh);
@@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (wbc->sync_mode == WB_SYNC_ALL)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
- ext4_error(inode->i_sb, "IO error syncing inode, "
- "inode=%lu, block=%llu", inode->i_ino,
- (unsigned long long)iloc.bh->b_blocknr);
+ EXT4_ERROR_INODE(inode,
+ "IO error syncing inode (block=%llu)",
+ (unsigned long long) iloc.bh->b_blocknr);
err = -EIO;
}
brelse(iloc.bh);
@@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
if (attr->ia_valid & ATTR_SIZE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
(attr->ia_size < inode->i_size ||
- (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
}
}
/* ext4_truncate will clear the flag */
- if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+ if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
ext4_truncate(inode);
}
@@ -5576,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
}
@@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
*/
if (val)
- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
else
- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
if (me.moved_len > 0)
file_remove_suid(donor_filp);
- if (copy_to_user((struct move_extent __user *)arg,
+ if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC32_SETRSVSZ:
cmd = EXT4_IOC_SETRSVSZ;
break;
- case EXT4_IOC_GROUP_ADD:
+ case EXT4_IOC32_GROUP_ADD: {
+ struct compat_ext4_new_group_input __user *uinput;
+ struct ext4_new_group_input input;
+ mm_segment_t old_fs;
+ int err;
+
+ uinput = compat_ptr(arg);
+ err = get_user(input.group, &uinput->group);
+ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+ err |= get_user(input.inode_table, &uinput->inode_table);
+ err |= get_user(input.blocks_count, &uinput->blocks_count);
+ err |= get_user(input.reserved_blocks,
+ &uinput->reserved_blocks);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+ (unsigned long) &input);
+ set_fs(old_fs);
+ return err;
+ }
+ case EXT4_IOC_MOVE_EXT:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
}
}
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+ int i;
+ int bits;
+
+ grp->bb_largest_free_order = -1; /* uninit */
+
+ bits = sb->s_blocksize_bits + 1;
+ for (i = bits; i >= 0; i--) {
+ if (grp->bb_counters[i] > 0) {
+ grp->bb_largest_free_order = i;
+ break;
+ }
+ }
+}
+
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
*/
grp->bb_free = free;
}
+ mb_set_largest_free_order(sb, grp);
clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
* So it can have information regarding groups_per_page which
* is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
*/
static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
BUG_ON(incore == NULL);
mb_debug(1, "put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
BUG_ON(incore != NULL);
mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ trace_ext4_mb_bitmap_load(sb, group);
/* see comments in ext4_mb_put_pa() */
ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
return err;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
@@ -1004,6 +1036,11 @@ err:
return ret;
}
+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
static noinline_for_stack int
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
return ret;
}
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
{
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
buddy = buddy2;
} while (1);
}
+ mb_set_largest_free_order(sb, e4b->bd_info);
mb_check_buddy(e4b);
}
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
e4b->bd_info->bb_counters[ord]++;
e4b->bd_info->bb_counters[ord]++;
}
+ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
ext4_unlock_group(ac->ac_sb, group);
- ext4_mb_release_desc(e4b);
+ ext4_mb_unload_buddy(e4b);
return 0;
}
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+/* This is now called BEFORE we load the buddy bitmap. */
static int ext4_mb_good_group(struct ext4_allocation_context *ac,
ext4_group_t group, int cr)
{
unsigned free, fragments;
- unsigned i, bits;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
BUG_ON(cr < 0 || cr >= 4);
- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ int ret = ext4_mb_init_group(ac->ac_sb, group);
+ if (ret)
+ return 0;
+ }
free = grp->bb_free;
fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
case 0:
BUG_ON(ac->ac_2order == 0);
+ if (grp->bb_largest_free_order < ac->ac_2order)
+ return 0;
+
/* Avoid using the first bg of a flexgroup for data files */
if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
(flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
((group % flex_size) == 0))
return 0;
- bits = ac->ac_sb->s_blocksize_bits + 1;
- for (i = ac->ac_2order; i <= bits; i++)
- if (grp->bb_counters[i] > 0)
- return 1;
- break;
+ return 1;
case 1:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
sbi = EXT4_SB(sb);
ngroups = ext4_get_groups_count(sb);
/* non-extent files are limited to low blocks/groups */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
group = ac->ac_g_ex.fe_group;
for (i = 0; i < ngroups; group++, i++) {
- struct ext4_group_info *grp;
- struct ext4_group_desc *desc;
-
if (group == ngroups)
group = 0;
- /* quick check to skip empty groups */
- grp = ext4_get_group_info(sb, group);
- if (grp->bb_free == 0)
+ /* This now checks without needing the buddy page */
+ if (!ext4_mb_good_group(ac, group, cr))
continue;
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
goto out;
ext4_lock_group(sb, group);
+
+ /*
+ * We need to check again after locking the
+ * block group
+ */
if (!ext4_mb_good_group(ac, group, cr)) {
- /* someone did allocation from this group */
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
ac->ac_groups_scanned++;
- desc = ext4_get_group_desc(sb, group, NULL);
if (cr == 0)
ext4_mb_simple_scan_group(ac, &e4b);
else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
ext4_mb_complex_scan_group(ac, &e4b);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
if (ac->ac_status != AC_STATUS_CONTINUE)
break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
ext4_lock_group(sb, group);
memcpy(&sg, ext4_get_group_info(sb, group), i);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
+ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
#ifdef DOUBLE_CHECK
{
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
entry->count, entry->group, entry);
if (test_opt(sb, DISCARD)) {
+ int ret;
ext4_fsblk_t discard_block;
discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
trace_ext4_discard_blocks(sb,
(unsigned long long)discard_block,
entry->count);
- sb_issue_discard(sb, discard_block, entry->count);
+ ret = sb_issue_discard(sb, discard_block, entry->count);
+ if (ret == EOPNOTSUPP) {
+ ext4_warning(sb,
+ "discard not supported, disabling");
+ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+ }
}
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
}
ext4_unlock_group(sb, entry->group);
kmem_cache_free(ext4_free_ext_cachep, entry);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
}
mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
void exit_ext4_mballoc(void)
{
- /*
+ /*
* Wait for completion of call_rcu()'s on ext4_pspace_cachep
* before destroying the slab cache.
*/
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
- if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+ if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
continue;
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
continue;
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
- /*
+ /*
* If doing group-based preallocation, pa_pstart may be in the
* next group when pa is used up
*/
@@ -3697,7 +3747,7 @@ out:
ext4_unlock_group(sb, group);
if (ac)
kmem_cache_free(ext4_ac_cachep, ac);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
return free;
}
@@ -3801,7 +3851,7 @@ repeat:
if (bitmap_bh == NULL) {
ext4_error(sb, "Error reading block bitmap for %u",
group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -3810,7 +3860,7 @@ repeat:
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_mb_release_group_pa(&e4b, pa, ac);
ext4_unlock_group(sb, group);
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
list_del(&pa->u.pa_tmp_list);
call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
}
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
if (!bh)
tbh = sb_find_get_block(inode->i_sb,
block + i);
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
inode, tbh, block + i);
}
}
- /*
+ /*
* We need to make sure we don't reuse the freed block until
* after the transaction is committed, which we can do by
* treating the block as metadata, below. We make an
@@ -4610,7 +4660,7 @@ do_more:
atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}
- ext4_mb_release_desc(&e4b);
+ ext4_mb_unload_buddy(&e4b);
freed += count;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
int depth = ext_depth(orig_inode);
int ret;
+ start_ext.ee_block = end_ext.ee_block = 0;
o_start = o_end = oext = orig_path[depth].p_ext;
oext_alen = ext4_ext_get_actual_len(oext);
start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
* new_ext |-------|
*/
if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
- ext4_error(orig_inode->i_sb,
+ EXT4_ERROR_INODE(orig_inode,
"new_ext_end(%u) should be less than or equal to "
"oext->ee_block(%u) + oext_alen(%d) - 1",
new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
while (1) {
/* The extent for donor must be found. */
if (!dext) {
- ext4_error(donor_inode->i_sb,
+ EXT4_ERROR_INODE(donor_inode,
"The extent for donor must be found");
*err = -EIO;
goto out;
} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
- ext4_error(donor_inode->i_sb,
+ EXT4_ERROR_INODE(donor_inode,
"Donor offset(%u) and the first block of donor "
"extent(%u) should be equal",
donor_off,
@@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
}
/* Ext4 move extent supports only extent based file */
- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: orig file is not extents "
"based file [ino:orig %lu]\n", orig_inode->i_ino);
return -EOPNOTSUPP;
- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
ext4_debug("ext4 move extent: donor file is not extents "
"based file [ino:donor %lu]\n", donor_inode->i_ino);
return -EOPNOTSUPP;
@@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
if (ret1 < 0)
break;
if (*moved_len > len) {
- ext4_error(orig_inode->i_sb,
+ EXT4_ERROR_INODE(orig_inode,
"We replaced blocks too much! "
"sum of replaced: %llu requested: %llu",
*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
return blocksize;
return (len & 65532) | ((len & 3) << 16);
}
-
+
__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
if (len == blocksize) {
if (blocksize == 65536)
return cpu_to_le16(EXT4_MAX_REC_LEN);
- else
+ else
return cpu_to_le16(0);
}
return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
brelse(bh);
}
if (bcount)
- printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+ printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
levels ? "" : " ", names, space/bcount,
(space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int ret, err;
__u32 hashval;
- dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
start_hash, start_minor_hash));
dir = dir_file->f_path.dentry->d_inode;
- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
if (hinfo.hash_version <= DX_HASH_TEA)
hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
{
if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX))
- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
@@ -943,8 +943,8 @@ restart:
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
/* read error, skip block & hope for the best */
- ext4_error(sb, "reading directory #%lu offset %lu",
- dir->i_ino, (unsigned long)block);
+ EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+ (unsigned long) block);
brelse(bh);
goto next;
}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
__u32 ino = le32_to_cpu(de->inode);
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
- ext4_error(dir->i_sb, "bad inode number: %u", ino);
+ EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
if (unlikely(IS_ERR(inode))) {
if (PTR_ERR(inode) == -ESTALE) {
- ext4_error(dir->i_sb,
- "deleted inode referenced: %u",
- ino);
+ EXT4_ERROR_INODE(dir,
+ "deleted inode referenced: %u",
+ ino);
return ERR_PTR(-EIO);
} else {
return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
brelse(bh);
if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
- ext4_error(child->d_inode->i_sb,
- "bad inode number: %u", ino);
+ EXT4_ERROR_INODE(child->d_inode,
+ "bad parent inode number: %u", ino);
return ERR_PTR(-EIO);
}
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
unsigned rec_len = 0;
while (count--) {
- struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
- ext4_error(dir->i_sb,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
+ EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
return -EIO;
}
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
brelse(bh);
return retval;
}
- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
data1 = bh2->b_data;
memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
return retval;
- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
}
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
+ if (retval == 0)
+ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
return retval;
}
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
!(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
if (err)
- ext4_error(inode->i_sb,
- "error %d reading directory #%lu offset 0",
- err, inode->i_ino);
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory lblock 0", err);
else
ext4_warning(inode->i_sb,
"bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ unsigned int lblock;
err = 0;
brelse(bh);
- bh = ext4_bread(NULL, inode,
- offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+ lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ bh = ext4_bread(NULL, inode, lblock, 0, &err);
if (!bh) {
if (err)
- ext4_error(sb,
- "error %d reading directory"
- " #%lu offset %u",
- err, inode->i_ino, offset);
+ EXT4_ERROR_INODE(inode,
+ "error %d reading directory "
+ "lblock %u", err, lblock);
offset += sb->s_blocksize;
continue;
}
@@ -2297,7 +2296,7 @@ retry:
}
} else {
/* clear the extent format for fast symlink */
- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
inode->i_op = &ext4_fast_symlink_inode_operations;
memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+ sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..49d88c0597c4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
/* Special case here: if the journal has aborted behind our
* backs (eg. EIO in the commit thread), then we still need to
* take the FS itself readonly cleanly. */
@@ -941,6 +942,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
seq_puts(seq, ",journal_async_commit");
+ else if (test_opt(sb, JOURNAL_CHECKSUM))
+ seq_puts(seq, ",journal_checksum");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (test_opt(sb, I_VERSION))
@@ -2213,7 +2216,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
struct ext4_attr {
struct attribute attr;
ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
- ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
const char *, size_t);
int offset;
};
@@ -2430,6 +2433,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
{
+ char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi;
@@ -2793,24 +2797,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err) {
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- }
- if (!err) {
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- }
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount3;
- }
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
@@ -2910,6 +2896,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
no_journal:
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ if (!err)
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount_wq;
+ }
if (test_opt(sb, NOBH)) {
if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3001,7 @@ no_journal:
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
- "zone (%d)\n", err);
+ "zone (%d)", err);
goto failed_mount4;
}
@@ -3040,9 +3040,11 @@ no_journal:
} else
descr = "out journal";
- ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+ ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+ "Opts: %s", descr, orig_data);
lock_kernel();
+ kfree(orig_data);
return 0;
cantfind_ext4:
@@ -3059,6 +3061,10 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3072,6 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3091,7 @@ out_fail:
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
lock_kernel();
+ kfree(orig_data);
return ret;
}
@@ -3380,7 +3383,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
if (!(sb->s_flags & MS_RDONLY))
es->s_wtime = cpu_to_le32(get_seconds());
es->s_kbytes_written =
- cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
EXT4_SB(sb)->s_sectors_written_start) >> 1));
ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3488,10 @@ int ext4_force_commit(struct super_block *sb)
return 0;
journal = EXT4_SB(sb)->s_journal;
- if (journal)
+ if (journal) {
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
ret = ext4_journal_force_commit(journal);
+ }
return ret;
}
@@ -3535,18 +3540,16 @@ static int ext4_freeze(struct super_block *sb)
* the journal.
*/
error = jbd2_journal_flush(journal);
- if (error < 0) {
- out:
- jbd2_journal_unlock_updates(journal);
- return error;
- }
+ if (error < 0)
+ goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
error = ext4_commit_super(sb, 1);
- if (error)
- goto out;
- return 0;
+out:
+ /* we rely on s_frozen to stop further updates */
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return error;
}
/*
@@ -3563,7 +3566,6 @@ static int ext4_unfreeze(struct super_block *sb)
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
ext4_commit_super(sb, 1);
unlock_super(sb);
- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
return 0;
}
@@ -3580,6 +3582,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_QUOTA
int i;
#endif
+ char *orig_data = kstrdup(data, GFP_KERNEL);
lock_kernel();
@@ -3713,6 +3716,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
#endif
unlock_super(sb);
unlock_kernel();
+
+ ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+ kfree(orig_data);
return 0;
restore_opts:
@@ -3734,6 +3740,7 @@ restore_opts:
#endif
unlock_super(sb);
unlock_kernel();
+ kfree(orig_data);
return err;
}
@@ -4141,6 +4148,7 @@ static int __init init_ext4_fs(void)
{
int err;
+ ext4_check_flag_values();
err = init_ext4_system_zone();
if (err)
return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
+ .setattr = ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
+ .setattr = ext4_setattr,
#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
bad_block:
- ext4_error(inode->i_sb,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
ea_bdebug(bh, "b_count=%d, refcount=%d",
atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb,
- "inode %lu: bad block %llu", inode->i_ino,
- EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
if (ext4_xattr_check_block(bs->bh)) {
- ext4_error(sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -820,7 +818,7 @@ inserted:
EXT4_I(inode)->i_block_group);
/* non-extent files can't have physical blocks past 2^32 */
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
if (error)
goto cleanup;
- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
goto cleanup;
bad_block:
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
#undef header
@@ -1194,8 +1192,8 @@ retry:
if (!bh)
goto cleanup;
if (ext4_xattr_check_block(bh)) {
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
error = -EIO;
goto cleanup;
}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
goto cleanup;
bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
if (!bh) {
- ext4_error(inode->i_sb, "inode %lu: block %llu read error",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "block %llu read error",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1)) {
- ext4_error(inode->i_sb, "inode %lu: bad block %llu",
- inode->i_ino, EXT4_I(inode)->i_file_acl);
+ EXT4_ERROR_INODE(inode, "bad block %llu",
+ EXT4_I(inode)->i_file_acl);
goto cleanup;
}
ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
}
bh = sb_bread(inode->i_sb, ce->e_block);
if (!bh) {
- ext4_error(inode->i_sb,
- "inode %lu: block %lu read error",
- inode->i_ino, (unsigned long) ce->e_block);
+ EXT4_ERROR_INODE(inode, "block %lu read error",
+ (unsigned long) ce->e_block);
} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
EXT4_XATTR_REFCOUNT_MAX) {
ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
};
const struct file_operations vxfs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
.readdir = vxfs_readdir,
};
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
/* banners (can't represent line 0 by pos 0 as that would involve
* returning a NULL pointer) */
if (pos == 0)
- return (struct fscache_object *) ++(*_pos);
+ return (struct fscache_object *)(long)++(*_pos);
if (pos < 3)
return (struct fscache_object *)pos;
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
const struct file_operations isofs_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = isofs_readdir,
};
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
- spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
transaction->t_outstanding_credits -= handle->h_buffer_credits;
transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
jbd_debug(2, "transaction too old, requesting commit for "
"handle %p\n", handle);
/* This is non-blocking */
- __jbd2_log_start_commit(journal, transaction->t_tid);
- spin_unlock(&journal->j_state_lock);
+ jbd2_log_start_commit(journal, transaction->t_tid);
/*
* Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
err = jbd2_log_wait_commit(journal, tid);
} else {
spin_unlock(&transaction->t_handle_lock);
- spin_unlock(&journal->j_state_lock);
}
lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 92dde6f8d893..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
const struct file_operations ncp_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ncp_readdir,
.unlocked_ioctl = ncp_ioctl,
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..db64854b7b09 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1741,6 +1741,7 @@ remove_lru_entry:
clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
smp_mb__after_clear_bit();
}
+ spin_unlock(&inode->i_lock);
}
spin_unlock(&nfs_access_lru_lock);
nfs_access_free_list(&head);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
int res = 0;
if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
- goto out;
+ goto out_mark_dirty;
spin_lock(&inode->i_lock);
res = nfs_scan_commit(inode, &head, 0, 0);
spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
nfs_wait_bit_killable,
TASK_KILLABLE);
+ else
+ goto out_mark_dirty;
} else
nfs_commit_clear_lock(NFS_I(inode));
-out:
+ return res;
+ /* Note: If we exit without ensuring that the commit is complete,
+ * we must mark the inode as dirty. Otherwise, future calls to
+ * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+ * that the data is on the disk.
+ */
+out_mark_dirty:
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return res;
}
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
};
int ret;
- while(PagePrivate(page)) {
+ for (;;) {
wait_on_page_writeback(page);
if (clear_page_dirty_for_io(page)) {
ret = nfs_writepage_locked(page, &wbc);
if (ret < 0)
goto out_error;
+ continue;
}
- ret = sync_inode(inode, &wbc);
+ if (!PagePrivate(page))
+ break;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
if (ret < 0)
goto out_error;
}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
shpending = p->signal->shared_pending.signal;
blocked = p->blocked;
collect_sigign_sigcatch(p, &ignored, &caught);
- num_threads = atomic_read(&p->signal->count);
+ num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */
qsize = atomic_read(&__task_cred(p)->user->sigpending);
rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
tty_nr = new_encode_dev(tty_devnum(sig->tty));
}
- num_threads = atomic_read(&sig->count);
+ num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
return result;
}
-static int get_nr_threads(struct task_struct *tsk)
-{
- unsigned long flags;
- int count = 0;
-
- if (lock_task_sighand(tsk, &flags)) {
- count = atomic_read(&tsk->signal->count);
- unlock_task_sighand(tsk, &flags);
- }
- return count;
-}
-
static int proc_cwd_link(struct inode *inode, struct path *path)
{
struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
const struct pid_entry *p = ptr;
struct inode *inode;
struct proc_inode *ei;
- struct dentry *error = ERR_PTR(-EINVAL);
+ struct dentry *error;
/* Allocate the inode */
error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
{
- struct dentry *result = ERR_PTR(-ENOENT);
+ struct dentry *result;
struct task_struct *task;
unsigned tgid;
struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
/*
* Return an inode number between PROC_DYNAMIC_FIRST and
* 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000 reserved
- * 00000001-00000fff static entries (goners)
- * 001 root-ino
- *
- * 00001000-00001fff unused
- * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
- * 80000000-efffffff unused
- * f0000000-ffffffff dynamic entries
- *
- * Goal:
- * Once we split the thing into several virtual filesystems,
- * we will get rid of magical ranges (and this comment, BTW).
*/
static unsigned int get_inode_number(void)
{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
*/
static void __init proc_kcore_text_init(void)
{
- kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+ kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
}
#else
static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
if (err)
return;
proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- err = PTR_ERR(proc_mnt);
if (IS_ERR(proc_mnt)) {
unregister_filesystem(&proc_fs_type);
return;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..3d3fd4692133 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,6 +77,7 @@ out:
const struct file_operations qnx4_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = qnx4_readdir,
.fsync = simple_fsync,
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..1ad8bf076cfc 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1514,11 +1514,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
/*
* This operation can block, but only after everything is updated
*/
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
- int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
int cnt, ret = 0;
char warntype[MAXQUOTAS];
+ int warn = flags & DQUOT_SPACE_WARN;
+ int reserve = flags & DQUOT_SPACE_RESERVE;
+ int nofail = flags & DQUOT_SPACE_NOFAIL;
/*
* First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1541,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
continue;
ret = check_bdq(inode->i_dquot[cnt], number, !warn,
warntype+cnt);
- if (ret) {
+ if (ret && !nofail) {
spin_unlock(&dq_data_lock);
goto out_flush_warn;
}
@@ -1638,10 +1640,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
/*
* This operation can block, but only after everything is updated
*/
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
+ int reserve = flags & DQUOT_SPACE_RESERVE;
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
EXPORT_SYMBOL(generic_file_llseek);
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file: file structure to seek on
+ * @offset: file offset to seek to
+ * @origin: type of seek
+ *
+ * This is an implementation of ->llseek useable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+ return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..4455fbe269a3 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -18,6 +18,7 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
int datasync);
const struct file_operations reiserfs_dir_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = reiserfs_readdir,
.fsync = reiserfs_dir_fsync,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 6c978428892d..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
const struct file_operations smb_dir_operations =
{
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = smb_readdir,
.unlocked_ioctl = smb_ioctl,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
If unsure, say N.
+config SQUASHFS_XATTRS
+ bool "Squashfs XATTR support"
+ depends on SQUASHFS
+ default n
+ help
+ Saying Y here includes support for extended attributes (xattrs).
+ Xattrs are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page).
+
+ If unsure, say N.
+
config SQUASHFS_EMBEDDED
bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
#include <linux/fs.h>
#include <linux/vfs.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
/*
* Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
union squashfs_inode squashfs_ino;
struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+ int xattr_id = SQUASHFS_INVALID_XATTR;
TRACE("Entered squashfs_read_inode\n");
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
frag_offset = 0;
}
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+ inode->i_op = &squashfs_inode_ops;
inode->i_fop = &generic_ro_fops;
inode->i_mode |= S_IFREG;
inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
if (err < 0)
goto failed_read;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le32_to_cpu(sqsh_ino->file_size);
inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
- inode->i_op = &page_symlink_inode_operations;
+ inode->i_op = &squashfs_symlink_inode_ops;
inode->i_data.a_ops = &squashfs_symlink_aops;
inode->i_mode |= S_IFLNK;
squashfs_i(inode)->start = block;
squashfs_i(inode)->offset = offset;
+ if (type == SQUASHFS_LSYMLINK_TYPE) {
+ __le32 xattr;
+
+ err = squashfs_read_metadata(sb, NULL, &block,
+ &offset, inode->i_size);
+ if (err < 0)
+ goto failed_read;
+ err = squashfs_read_metadata(sb, &xattr, &block,
+ &offset, sizeof(xattr));
+ if (err < 0)
+ goto failed_read;
+ xattr_id = le32_to_cpu(xattr);
+ }
+
TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
"%x\n", SQUASHFS_INODE_BLK(ino), offset,
block, offset);
break;
}
case SQUASHFS_BLKDEV_TYPE:
- case SQUASHFS_CHRDEV_TYPE:
- case SQUASHFS_LBLKDEV_TYPE:
- case SQUASHFS_LCHRDEV_TYPE: {
+ case SQUASHFS_CHRDEV_TYPE: {
struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
unsigned int rdev;
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
SQUASHFS_INODE_BLK(ino), offset, rdev);
break;
}
+ case SQUASHFS_LBLKDEV_TYPE:
+ case SQUASHFS_LCHRDEV_TYPE: {
+ struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+ unsigned int rdev;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_LCHRDEV_TYPE)
+ inode->i_mode |= S_IFCHR;
+ else
+ inode->i_mode |= S_IFBLK;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
+ inode->i_op = &squashfs_inode_ops;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ rdev = le32_to_cpu(sqsh_ino->rdev);
+ init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+
+ TRACE("Device inode %x:%x, rdev %x\n",
+ SQUASHFS_INODE_BLK(ino), offset, rdev);
+ break;
+ }
case SQUASHFS_FIFO_TYPE:
- case SQUASHFS_SOCKET_TYPE:
- case SQUASHFS_LFIFO_TYPE:
- case SQUASHFS_LSOCKET_TYPE: {
+ case SQUASHFS_SOCKET_TYPE: {
struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
init_special_inode(inode, inode->i_mode, 0);
break;
}
+ case SQUASHFS_LFIFO_TYPE:
+ case SQUASHFS_LSOCKET_TYPE: {
+ struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+
+ err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+ sizeof(*sqsh_ino));
+ if (err < 0)
+ goto failed_read;
+
+ if (type == SQUASHFS_LFIFO_TYPE)
+ inode->i_mode |= S_IFIFO;
+ else
+ inode->i_mode |= S_IFSOCK;
+ xattr_id = le32_to_cpu(sqsh_ino->xattr);
+ inode->i_op = &squashfs_inode_ops;
+ inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+ init_special_inode(inode, inode->i_mode, 0);
+ break;
+ }
default:
ERROR("Unknown inode type %d in squashfs_iget!\n", type);
return -EINVAL;
}
+ if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+ err = squashfs_xattr_lookup(sb, xattr_id,
+ &squashfs_i(inode)->xattr_count,
+ &squashfs_i(inode)->xattr_size,
+ &squashfs_i(inode)->xattr);
+ if (err < 0)
+ goto failed_read;
+ inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+ + 1;
+ } else
+ squashfs_i(inode)->xattr_count = 0;
+
return 0;
failed_read:
ERROR("Unable to read inode 0x%llx\n", ino);
return err;
}
+
+
+const struct inode_operations squashfs_inode_ops = {
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/dcache.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
/*
* Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
const struct inode_operations squashfs_dir_inode_ops = {
- .lookup = squashfs_lookup
+ .lookup = squashfs_lookup,
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
unsigned int);
extern int squashfs_read_inode(struct inode *, long long);
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
+
/*
- * Inodes, files and decompressor operations
+ * Inodes, files, decompressor and xattr operations
*/
/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
/* file.c */
extern const struct address_space_operations squashfs_aops;
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
+
/* namei.c */
extern const struct inode_operations squashfs_dir_inode_ops;
/* symlink.c */
extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
/* zlib_wrapper.c */
extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
#define SQUASHFS_NAME_LEN 256
#define SQUASHFS_INVALID_FRAG (0xffffffffU)
+#define SQUASHFS_INVALID_XATTR (0xffffffffU)
#define SQUASHFS_INVALID_BLK (-1LL)
/* Filesystem flags */
@@ -96,6 +97,13 @@
#define SQUASHFS_LFIFO_TYPE 13
#define SQUASHFS_LSOCKET_TYPE 14
+/* Xattr types */
+#define SQUASHFS_XATTR_USER 0
+#define SQUASHFS_XATTR_TRUSTED 1
+#define SQUASHFS_XATTR_SECURITY 2
+#define SQUASHFS_XATTR_VALUE_OOL 256
+#define SQUASHFS_XATTR_PREFIX_MASK 0xff
+
/* Flag whether block is compressed or uncompressed, bit is set if block is
* uncompressed */
#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
sizeof(u64))
+/* xattr id lookup table defines */
+#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
+
+#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
+ SQUASHFS_METADATA_SIZE - 1) / \
+ SQUASHFS_METADATA_SIZE)
+
+#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
+ sizeof(u64))
+#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
+
+#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
/* cached data constants for filesystem */
#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
__le64 root_inode;
__le64 bytes_used;
__le64 id_table_start;
- __le64 xattr_table_start;
+ __le64 xattr_id_table_start;
__le64 inode_table_start;
__le64 directory_table_start;
__le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
__le32 nlink;
};
+struct squashfs_lipc_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 xattr;
+};
+
struct squashfs_dev_inode {
__le16 inode_type;
__le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
__le32 rdev;
};
+struct squashfs_ldev_inode {
+ __le16 inode_type;
+ __le16 mode;
+ __le16 uid;
+ __le16 guid;
+ __le32 mtime;
+ __le32 inode_number;
+ __le32 nlink;
+ __le32 rdev;
+ __le32 xattr;
+};
+
struct squashfs_symlink_inode {
__le16 inode_type;
__le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
union squashfs_inode {
struct squashfs_base_inode base;
struct squashfs_dev_inode dev;
+ struct squashfs_ldev_inode ldev;
struct squashfs_symlink_inode symlink;
struct squashfs_reg_inode reg;
struct squashfs_lreg_inode lreg;
struct squashfs_dir_inode dir;
struct squashfs_ldir_inode ldir;
struct squashfs_ipc_inode ipc;
+ struct squashfs_lipc_inode lipc;
};
struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
unsigned int unused;
};
+struct squashfs_xattr_entry {
+ __le16 type;
+ __le16 size;
+ char data[0];
+};
+
+struct squashfs_xattr_val {
+ __le32 vsize;
+ char value[0];
+};
+
+struct squashfs_xattr_id {
+ __le64 xattr;
+ __le32 count;
+ __le32 size;
+};
+
+struct squashfs_xattr_id_table {
+ __le64 xattr_table_start;
+ __le32 xattr_ids;
+ __le32 unused;
+};
+
#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
struct squashfs_inode_info {
u64 start;
int offset;
+ u64 xattr;
+ unsigned int xattr_size;
+ int xattr_count;
union {
struct {
u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
int next_meta_index;
__le64 *id_table;
__le64 *fragment_index;
+ __le64 *xattr_id_table;
struct mutex read_data_mutex;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
+ u64 xattr_table;
unsigned int block_size;
unsigned short block_log;
long long bytes_used;
unsigned int inodes;
+ int xattr_ids;
};
#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/magic.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "xattr.h"
static struct file_system_type squashfs_fs_type;
static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
long long root_inode;
unsigned short flags;
unsigned int fragments;
- u64 lookup_table_start;
+ u64 lookup_table_start, xattr_id_table_start;
int err;
TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
if (msblk->decompressor == NULL)
goto failed_mount;
- /*
- * Check if there's xattrs in the filesystem. These are not
- * supported in this version, so warn that they will be ignored.
- */
- if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
- ERROR("Xattrs in filesystem, these will be ignored\n");
-
/* Check the filesystem does not extend beyond the end of the
block device */
msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
allocate_lookup_table:
lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
if (lookup_table_start == SQUASHFS_INVALID_BLK)
- goto allocate_root;
+ goto allocate_xattr_table;
/* Allocate and read inode lookup table */
msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
sb->s_export_op = &squashfs_export_ops;
+allocate_xattr_table:
+ sb->s_xattr = squashfs_xattr_handlers;
+ xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+ if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
+ goto allocate_root;
+
+ /* Allocate and read xattr id lookup table */
+ msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+ xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+ if (IS_ERR(msblk->xattr_id_table)) {
+ err = PTR_ERR(msblk->xattr_id_table);
+ msblk->xattr_id_table = NULL;
+ if (err != -ENOTSUPP)
+ goto failed_mount;
+ }
allocate_root:
root = new_inode(sb);
if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
+ kfree(msblk->xattr_id_table);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
kfree(sbi->inode_lookup_table);
+ kfree(sbi->xattr_id_table);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/pagemap.h>
+#include <linux/xattr.h>
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs_fs_i.h"
#include "squashfs.h"
+#include "xattr.h"
static int squashfs_symlink_readpage(struct file *file, struct page *page)
{
@@ -114,3 +116,12 @@ error_out:
const struct address_space_operations squashfs_symlink_aops = {
.readpage = squashfs_symlink_readpage
};
+
+const struct inode_operations squashfs_symlink_inode_ops = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .getxattr = generic_getxattr,
+ .listxattr = squashfs_listxattr
+};
+
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+static const struct xattr_handler *squashfs_xattr_handler(int);
+
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+ size_t buffer_size)
+{
+ struct inode *inode = d->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+ + msblk->xattr_table;
+ int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+ int count = squashfs_i(inode)->xattr_count;
+ size_t rest = buffer_size;
+ int err;
+
+ /* check that the file system has xattrs */
+ if (msblk->xattr_id_table == NULL)
+ return -EOPNOTSUPP;
+
+ /* loop reading each xattr name */
+ while (count--) {
+ struct squashfs_xattr_entry entry;
+ struct squashfs_xattr_val val;
+ const struct xattr_handler *handler;
+ int name_size, prefix_size = 0;
+
+ err = squashfs_read_metadata(sb, &entry, &start, &offset,
+ sizeof(entry));
+ if (err < 0)
+ goto failed;
+
+ name_size = le16_to_cpu(entry.size);
+ handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+ if (handler)
+ prefix_size = handler->list(d, buffer, rest, NULL,
+ name_size, handler->flags);
+ if (prefix_size) {
+ if (buffer) {
+ if (prefix_size + name_size + 1 > rest) {
+ err = -ERANGE;
+ goto failed;
+ }
+ buffer += prefix_size;
+ }
+ err = squashfs_read_metadata(sb, buffer, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+ if (buffer) {
+ buffer[name_size] = '\0';
+ buffer += name_size + 1;
+ }
+ rest -= prefix_size + name_size + 1;
+ } else {
+ /* no handler or insuffficient privileges, so skip */
+ err = squashfs_read_metadata(sb, NULL, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+ }
+
+
+ /* skip remaining xattr entry */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+
+ err = squashfs_read_metadata(sb, NULL, &start, &offset,
+ le32_to_cpu(val.vsize));
+ if (err < 0)
+ goto failed;
+ }
+ err = buffer_size - rest;
+
+failed:
+ return err;
+}
+
+
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+ const char *name, void *buffer, size_t buffer_size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+ + msblk->xattr_table;
+ int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+ int count = squashfs_i(inode)->xattr_count;
+ int name_len = strlen(name);
+ int err, vsize;
+ char *target = kmalloc(name_len, GFP_KERNEL);
+
+ if (target == NULL)
+ return -ENOMEM;
+
+ /* loop reading each xattr name */
+ for (; count; count--) {
+ struct squashfs_xattr_entry entry;
+ struct squashfs_xattr_val val;
+ int type, prefix, name_size;
+
+ err = squashfs_read_metadata(sb, &entry, &start, &offset,
+ sizeof(entry));
+ if (err < 0)
+ goto failed;
+
+ name_size = le16_to_cpu(entry.size);
+ type = le16_to_cpu(entry.type);
+ prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+
+ if (prefix == name_index && name_size == name_len)
+ err = squashfs_read_metadata(sb, target, &start,
+ &offset, name_size);
+ else
+ err = squashfs_read_metadata(sb, NULL, &start,
+ &offset, name_size);
+ if (err < 0)
+ goto failed;
+
+ if (prefix == name_index && name_size == name_len &&
+ strncmp(target, name, name_size) == 0) {
+ /* found xattr */
+ if (type & SQUASHFS_XATTR_VALUE_OOL) {
+ __le64 xattr;
+ /* val is a reference to the real location */
+ err = squashfs_read_metadata(sb, &val, &start,
+ &offset, sizeof(val));
+ if (err < 0)
+ goto failed;
+ err = squashfs_read_metadata(sb, &xattr, &start,
+ &offset, sizeof(xattr));
+ if (err < 0)
+ goto failed;
+ xattr = le64_to_cpu(xattr);
+ start = SQUASHFS_XATTR_BLK(xattr) +
+ msblk->xattr_table;
+ offset = SQUASHFS_XATTR_OFFSET(xattr);
+ }
+ /* read xattr value */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+
+ vsize = le32_to_cpu(val.vsize);
+ if (buffer) {
+ if (vsize > buffer_size) {
+ err = -ERANGE;
+ goto failed;
+ }
+ err = squashfs_read_metadata(sb, buffer, &start,
+ &offset, vsize);
+ if (err < 0)
+ goto failed;
+ }
+ break;
+ }
+
+ /* no match, skip remaining xattr entry */
+ err = squashfs_read_metadata(sb, &val, &start, &offset,
+ sizeof(val));
+ if (err < 0)
+ goto failed;
+ err = squashfs_read_metadata(sb, NULL, &start, &offset,
+ le32_to_cpu(val.vsize));
+ if (err < 0)
+ goto failed;
+ }
+ err = count ? vsize : -ENODATA;
+
+failed:
+ kfree(target);
+ return err;
+}
+
+
+/*
+ * User namespace support
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
+{
+ if (list && XATTR_USER_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ return XATTR_USER_PREFIX_LEN;
+}
+
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+ size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = squashfs_user_list,
+ .get = squashfs_user_get
+};
+
+/*
+ * Trusted namespace support
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
+ if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ return XATTR_TRUSTED_PREFIX_LEN;
+}
+
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = squashfs_trusted_list,
+ .get = squashfs_trusted_get
+};
+
+/*
+ * Security namespace support
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
+ memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+ return XATTR_SECURITY_PREFIX_LEN;
+}
+
+static int squashfs_security_get(struct dentry *d, const char *name,
+ void *buffer, size_t size, int type)
+{
+ if (name[0] == '\0')
+ return -EINVAL;
+
+ return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
+ buffer, size);
+}
+
+static const struct xattr_handler squashfs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = squashfs_security_list,
+ .get = squashfs_security_get
+};
+
+static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+ if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
+ /* ignore unrecognised type */
+ return NULL;
+
+ switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+ case SQUASHFS_XATTR_USER:
+ return &squashfs_xattr_user_handler;
+ case SQUASHFS_XATTR_TRUSTED:
+ return &squashfs_xattr_trusted_handler;
+ case SQUASHFS_XATTR_SECURITY:
+ return &squashfs_xattr_security_handler;
+ default:
+ /* ignore unrecognised type */
+ return NULL;
+ }
+}
+
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+ &squashfs_xattr_user_handler,
+ &squashfs_xattr_trusted_handler,
+ &squashfs_xattr_security_handler,
+ NULL
+};
+
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+
+#ifdef CONFIG_SQUASHFS_XATTRS
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+ u64 *, int *);
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+ int *, unsigned long long *);
+#else
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+ u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+ ERROR("Xattrs in filesystem, these will be ignored\n");
+ return ERR_PTR(-ENOTSUPP);
+}
+
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+ unsigned int index, int *count, int *size,
+ unsigned long long *xattr)
+{
+ return 0;
+}
+#define squashfs_listxattr NULL
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/*
+ * Map xattr id using the xattr id look up table
+ */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+ int *count, unsigned int *size, unsigned long long *xattr)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ int block = SQUASHFS_XATTR_BLOCK(index);
+ int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+ u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+ struct squashfs_xattr_id id;
+ int err;
+
+ err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+ sizeof(id));
+ if (err < 0)
+ return err;
+
+ *xattr = le64_to_cpu(id.xattr);
+ *size = le32_to_cpu(id.size);
+ *count = le32_to_cpu(id.count);
+ return 0;
+}
+
+
+/*
+ * Read uncompressed xattr id lookup table indexes from disk into memory
+ */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+ u64 *xattr_table_start, int *xattr_ids)
+{
+ unsigned int len;
+ __le64 *xid_table;
+ struct squashfs_xattr_id_table id_table;
+ int err;
+
+ err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
+ if (err < 0) {
+ ERROR("unable to read xattr id table\n");
+ return ERR_PTR(err);
+ }
+ *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
+ *xattr_ids = le32_to_cpu(id_table.xattr_ids);
+ len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+
+ TRACE("In read_xattr_index_table, length %d\n", len);
+
+ /* Allocate xattr id lookup table indexes */
+ xid_table = kmalloc(len, GFP_KERNEL);
+ if (xid_table == NULL) {
+ ERROR("Failed to allocate xattr id index table\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
+ if (err < 0) {
+ ERROR("unable to read xattr id index table\n");
+ kfree(xid_table);
+ return ERR_PTR(err);
+ }
+
+ return xid_table;
+}
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..1660c81ffa3d 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,6 +207,7 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
/* readdir and lookup functions */
const struct file_operations udf_dir_operations = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = udf_readdir,
.unlocked_ioctl = udf_ioctl,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..ad9bc1ebd3a6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -918,6 +918,7 @@ again:
sbi->s_bytesex = BYTESEX_LE;
switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
case UFS_MAGIC:
+ case UFS_MAGIC_BW:
case UFS2_MAGIC:
case UFS_MAGIC_LFN:
case UFS_MAGIC_FEA:
@@ -927,6 +928,7 @@ again:
sbi->s_bytesex = BYTESEX_BE;
switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
case UFS_MAGIC:
+ case UFS_MAGIC_BW:
case UFS2_MAGIC:
case UFS_MAGIC_LFN:
case UFS_MAGIC_FEA:
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
#define UFS_SECTOR_SIZE 512
#define UFS_SECTOR_BITS 9
#define UFS_MAGIC 0x00011954
+#define UFS_MAGIC_BW 0x0f242697
#define UFS2_MAGIC 0x19540119
#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */