From de0ee0edb21fbab4c7afa3e94573ecfebfb0244e Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 21 Jan 2016 10:17:54 +0000
Subject: Btrfs: fix race between fsync and lockless direct IO writes

An fsync, using the fast path, can race with a concurrent lockless direct
IO write and end up logging a file extent item that points to an extent
that wasn't written to yet. This is because the fast fsync path collects
ordered extents into a local list and then collects all the new extent
maps to log file extent items based on them, while the direct IO write
path creates the new extent map before it creates the corresponding
ordered extent (and submitting the respective bio(s)).

So fix this by making the direct IO write path create ordered extents
before the extent maps and make the fast fsync path collect any new
ordered extents after it collects the extent maps.
Note that making the fsync handler call inode_dio_wait() (after acquiring
the inode's i_mutex) would not work and lead to a deadlock when doing
AIO, as through AIO we end up in a path where the fsync handler is called
(through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range())
before the inode's dio counter is decremented (inode_dio_wait() waits
for this counter to have a value of zero).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     struct btrfs_path *path,
 				     struct list_head *logged_list,
-				     struct btrfs_log_ctx *ctx)
+				     struct btrfs_log_ctx *ctx,
+				     const u64 start,
+				     const u64 end)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
-
+	/*
+	 * Collect any new ordered extents within the range. This is to
+	 * prevent logging file extent items without waiting for the disk
+	 * location they point to being written. We do this only to deal
+	 * with races against concurrent lockless direct IO writes.
+	 */
+	btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
 			goto out_unlock;
 		}
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-						&logged_list, ctx);
+						&logged_list, ctx, start, end);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
-- 
cgit v1.2.3


From 1ec9a1ae1e30c733077c0b288c4301b66b7a81f2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 10 Feb 2016 10:42:25 +0000
Subject: Btrfs: fix unreplayable log after snapshot delete + parent dir fsync

If we delete a snapshot, fsync its parent directory and crash/power fail
before the next transaction commit, on the next mount when we attempt to
replay the log tree of the root containing the parent directory we will
fail and prevent the filesystem from mounting, which is solvable by wiping
out the log trees with the btrfs-zero-log tool but very inconvenient as
we will lose any data and metadata fsynced before the parent directory
was fsynced.

For example:

  $ mkfs.btrfs -f /dev/sdc
  $ mount /dev/sdc /mnt
  $ mkdir /mnt/testdir
  $ btrfs subvolume snapshot /mnt /mnt/testdir/snap
  $ btrfs subvolume delete /mnt/testdir/snap
  $ xfs_io -c "fsync" /mnt/testdir
  < crash / power failure and reboot >
  $ mount /dev/sdc /mnt
  mount: mount(2) failed: No such file or directory

And in dmesg/syslog we get the following message and trace:

[192066.361162] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257
[192066.363010] ------------[ cut here ]------------
[192066.365268] WARNING: CPU: 4 PID: 5130 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x17a/0x354 [btrfs]()
[192066.367250] BTRFS: Transaction aborted (error -2)
[192066.368401] Modules linked in: btrfs dm_flakey dm_mod ppdev sha256_generic xor raid6_pq hmac drbg ansi_cprng aesni_intel acpi_cpufreq tpm_tis aes_x86_64 tpm ablk_helper evdev cryptd sg parport_pc i2c_piix4 psmouse lrw parport i2c_core pcspkr gf128mul processor serio_raw glue_helper button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs]
[192066.377154] CPU: 4 PID: 5130 Comm: mount Tainted: G        W       4.4.0-rc6-btrfs-next-20+ #1
[192066.378875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
[192066.380889]  0000000000000000 ffff880143923670 ffffffff81257570 ffff8801439236b8
[192066.382561]  ffff8801439236a8 ffffffff8104ec07 ffffffffa039dc2c 00000000fffffffe
[192066.384191]  ffff8801ed31d000 ffff8801b9fc9c88 ffff8801086875e0 ffff880143923710
[192066.385827] Call Trace:
[192066.386373]  [<ffffffff81257570>] dump_stack+0x4e/0x79
[192066.387387]  [<ffffffff8104ec07>] warn_slowpath_common+0x99/0xb2
[192066.388429]  [<ffffffffa039dc2c>] ? __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.389236]  [<ffffffff8104ec68>] warn_slowpath_fmt+0x48/0x50
[192066.389884]  [<ffffffffa039dc2c>] __btrfs_unlink_inode+0x17a/0x354 [btrfs]
[192066.390621]  [<ffffffff81184b55>] ? iput+0xb0/0x266
[192066.391200]  [<ffffffffa039ea25>] btrfs_unlink_inode+0x1c/0x3d [btrfs]
[192066.391930]  [<ffffffffa03ca623>] check_item_in_log+0x1fe/0x29b [btrfs]
[192066.392715]  [<ffffffffa03ca827>] replay_dir_deletes+0x167/0x1cf [btrfs]
[192066.393510]  [<ffffffffa03cccc7>] replay_one_buffer+0x417/0x570 [btrfs]
[192066.394241]  [<ffffffffa03ca164>] walk_up_log_tree+0x10e/0x1dc [btrfs]
[192066.394958]  [<ffffffffa03cac72>] walk_log_tree+0xa5/0x190 [btrfs]
[192066.395628]  [<ffffffffa03ce8b8>] btrfs_recover_log_trees+0x239/0x32c [btrfs]
[192066.396790]  [<ffffffffa03cc8b0>] ? replay_one_extent+0x50a/0x50a [btrfs]
[192066.397891]  [<ffffffffa0394041>] open_ctree+0x1d8b/0x2167 [btrfs]
[192066.398897]  [<ffffffffa03706e1>] btrfs_mount+0x5ef/0x729 [btrfs]
[192066.399823]  [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.400739]  [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.401700]  [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.402482]  [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.403930]  [<ffffffffa03702bd>] btrfs_mount+0x1cb/0x729 [btrfs]
[192066.404831]  [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf
[192066.405726]  [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3
[192066.406621]  [<ffffffff811714b9>] mount_fs+0x67/0x131
[192066.407401]  [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde
[192066.408247]  [<ffffffff8118ae36>] do_mount+0x893/0x9d2
[192066.409047]  [<ffffffff8113009b>] ? strndup_user+0x3f/0x8c
[192066.409842]  [<ffffffff8118b187>] SyS_mount+0x75/0xa1
[192066.410621]  [<ffffffff8147e517>] entry_SYSCALL_64_fastpath+0x12/0x6b
[192066.411572] ---[ end trace 2de42126c1e0a0f0 ]---
[192066.412344] BTRFS: error (device dm-0) in __btrfs_unlink_inode:3986: errno=-2 No such entry
[192066.413748] BTRFS: error (device dm-0) in btrfs_replay_log:2464: errno=-2 No such entry (Failed to recover log tree)
[192066.415458] BTRFS error (device dm-0): cleaner transaction attach returned -30
[192066.444613] BTRFS: open_ctree failed

This happens because when we are replaying the log and processing the
directory entry pointing to the snapshot in the subvolume tree, we treat
its btrfs_dir_item item as having a location with a key type matching
BTRFS_INODE_ITEM_KEY, which is wrong because the type matches
BTRFS_ROOT_ITEM_KEY and therefore must be processed differently, as the
object id refers to a root number and not to an inode in the root
containing the parent directory.

So fix this by triggering a transaction commit if an fsync against the
parent directory is requested after deleting a snapshot. This is the
simplest approach for a rare use case. Some alternative that avoids the
transaction commit would require more code to explicitly delete the
snapshot at log replay time (factoring out common code from ioctl.c:
btrfs_ioctl_snap_destroy()), special care at fsync time to remove the
log tree of the snapshot's root from the log root of the root of tree
roots, amongst other steps.

A test case for xfstests that triggers the issue follows.

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      _cleanup_flakey
      cd /
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter
  . ./common/dmflakey

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_dm_target flakey
  _require_metadata_journaling $SCRATCH_DEV

  rm -f $seqres.full

  _scratch_mkfs >>$seqres.full 2>&1
  _init_flakey
  _mount_flakey

  # Create a snapshot at the root of our filesystem (mount point path), delete it,
  # fsync the mount point path, crash and mount to replay the log. This should
  # succeed and after the filesystem is mounted the snapshot should not be visible
  # anymore.
  _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap1
  _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT
  _flakey_drop_and_remount
  [ -e $SCRATCH_MNT/snap1 ] && \
      echo "Snapshot snap1 still exists after log replay"

  # Similar scenario as above, but this time the snapshot is created inside a
  # directory and not directly under the root (mount point path).
  mkdir $SCRATCH_MNT/testdir
  _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/testdir/snap2
  _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap2
  $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir
  _flakey_drop_and_remount
  [ -e $SCRATCH_MNT/testdir/snap2 ] && \
      echo "Snapshot snap2 still exists after log replay"

  _unmount_flakey

  echo "Silence is golden"
  status=0
  exit

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Tested-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c    |  3 +++
 fs/btrfs/tree-log.c | 15 +++++++++++++++
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 20 insertions(+)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 86249cf78897..ccb30ca9ebb2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -59,6 +59,7 @@
 #include "props.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "tree-log.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -2521,6 +2522,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 out_end_trans:
 	trans->block_rsv = NULL;
 	trans->bytes_reserved = 0;
+	if (!err)
+		btrfs_record_snapshot_destroy(trans, dir);
 	ret = btrfs_end_transaction(trans, root);
 	if (ret && !err)
 		err = ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 978c3a810893..43c6781af654 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -5497,6 +5497,21 @@ record:
 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
 }
 
+/*
+ * Make sure that if someone attempts to fsync the parent directory of a deleted
+ * snapshot, it ends up triggering a transaction commit. This is to guarantee
+ * that after replaying the log tree of the parent directory's root we will not
+ * see the snapshot anymore and at log replay time we will not see any log tree
+ * corresponding to the deleted snapshot's root, which could lead to replaying
+ * it after replaying the log tree of the parent directory (which would replay
+ * the snapshot delete operation).
+ */
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+				   struct inode *dir)
+{
+	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
 /*
  * Call this after adding a new name for a file and it will properly
  * update the log to reflect the new name.
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 6916a781ea02..a9f1b75d080d 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -79,6 +79,8 @@ int btrfs_pin_log_trans(struct btrfs_root *root);
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct inode *dir, struct inode *inode,
 			     int for_rename);
+void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
+				   struct inode *dir);
 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 			struct inode *inode, struct inode *old_dir,
 			struct dentry *parent);
-- 
cgit v1.2.3


From 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 12 Feb 2016 11:34:23 +0000
Subject: Btrfs: fix file loss on log replay after renaming a file and fsync

We have two cases where we end up deleting a file at log replay time
when we should not. For this to happen the file must have been renamed
and a directory inode must have been fsynced/logged.

Two examples that exercise these two cases are listed below.

  Case 1)

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir -p /mnt/a/b
  $ mkdir /mnt/c
  $ touch /mnt/a/b/foo
  $ sync
  $ mv /mnt/a/b/foo /mnt/c/
  # Create file bar just to make sure the fsync on directory a/ does
  # something and it's not a no-op.
  $ touch /mnt/a/bar
  $ xfs_io -c "fsync" /mnt/a
  < power fail / crash >

  The next time the filesystem is mounted, the log replay procedure
  deletes file foo.

  Case 2)

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ mkdir /mnt/a
  $ mkdir /mnt/b
  $ mkdir /mnt/c
  $ touch /mnt/a/foo
  $ ln /mnt/a/foo /mnt/b/foo_link
  $ touch /mnt/b/bar
  $ sync
  $ unlink /mnt/b/foo_link
  $ mv /mnt/b/bar /mnt/c/
  $ xfs_io -c "fsync" /mnt/a/foo
  < power fail / crash >

  The next time the filesystem is mounted, the log replay procedure
  deletes file bar.

The reason why the files are deleted is because when we log inodes
other then the fsync target inode, we ignore their last_unlink_trans
value and leave the log without enough information to later replay the
rename operations. So we need to look at the last_unlink_trans values
and fallback to a transaction commit if they are greater than the
id of the last committed transaction.

So fix this by looking at the last_unlink_trans values and fallback to
transaction commits when needed. Also, when logging other inodes (for
case 1 we logged descendants of the fsync target inode while for case 2
we logged ascendants) we need to care about concurrent tasks updating
the last_unlink_trans of inodes we are logging (which was already an
existing problem in check_parent_dirs_for_sync()). Since we can not
acquire their inode mutex (vfs' struct inode ->i_mutex), as that causes
deadlocks with other concurrent operations that acquire the i_mutex of
2 inodes (other fsyncs or renames for example), we need to serialize on
the log_mutex of the inode we are logging. A task setting a new value for
an inode's last_unlink_trans must acquire the inode's log_mutex and it
must do this update before doing the actual unlink operation (which is
already the case except when deleting a snapshot). Conversely the task
logging the inode must first log the inode and then check the inode's
last_unlink_trans value while holding its log_mutex, as if its value is
not greater then the id of the last committed transaction it means it
logged a safe state of the inode's items, while if its value is not
smaller then the id of the last committed transaction it means the inode
state it has logged might not be safe (the concurrent task might have
just updated last_unlink_trans but hasn't done yet the unlink operation)
and therefore a transaction commit must be done.

Test cases for xfstests follow in separate patches.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c    |  4 ++--
 fs/btrfs/tree-log.c | 67 +++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 59 insertions(+), 12 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index ccb30ca9ebb2..0af5ecbda9a3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2471,6 +2471,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	trans->block_rsv = &block_rsv;
 	trans->bytes_reserved = block_rsv.size;
 
+	btrfs_record_snapshot_destroy(trans, dir);
+
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				dest->root_key.objectid,
 				dentry->d_name.name,
@@ -2522,8 +2524,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 out_end_trans:
 	trans->block_rsv = NULL;
 	trans->bytes_reserved = 0;
-	if (!err)
-		btrfs_record_snapshot_destroy(trans, dir);
 	ret = btrfs_end_transaction(trans, root);
 	if (ret && !err)
 		err = ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 43c6781af654..9f6372dd0eab 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4771,6 +4771,42 @@ out_unlock:
 	return err;
 }
 
+/*
+ * Check if we must fallback to a transaction commit when logging an inode.
+ * This must be called after logging the inode and is used only in the context
+ * when fsyncing an inode requires the need to log some other inode - in which
+ * case we can't lock the i_mutex of each other inode we need to log as that
+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
+ * log inodes up or down in the hierarchy) or rename operations for example. So
+ * we take the log_mutex of the inode after we have logged it and then check for
+ * its last_unlink_trans value - this is safe because any task setting
+ * last_unlink_trans must take the log_mutex and it must do this before it does
+ * the actual unlink operation, so if we do this check before a concurrent task
+ * sets last_unlink_trans it means we've logged a consistent version/state of
+ * all the inode items, otherwise we are not sure and must do a transaction
+ * commit (the concurrent task migth have only updated last_unlink_trans before
+ * we logged the inode or it might have also done the unlink).
+ */
+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
+					  struct inode *inode)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	bool ret = false;
+
+	mutex_lock(&BTRFS_I(inode)->log_mutex);
+	if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
+		/*
+		 * Make sure any commits to the log are forced to be full
+		 * commits.
+		 */
+		btrfs_set_log_full_commit(fs_info, trans);
+		ret = true;
+	}
+	mutex_unlock(&BTRFS_I(inode)->log_mutex);
+
+	return ret;
+}
+
 /*
  * follow the dentry parent pointers up the chain and see if any
  * of the directories in it require a full commit before they can
@@ -4784,7 +4820,6 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 					       u64 last_committed)
 {
 	int ret = 0;
-	struct btrfs_root *root;
 	struct dentry *old_parent = NULL;
 	struct inode *orig_inode = inode;
 
@@ -4816,14 +4851,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 			BTRFS_I(inode)->logged_trans = trans->transid;
 		smp_mb();
 
-		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
-			root = BTRFS_I(inode)->root;
-
-			/*
-			 * make sure any commits to the log are forced
-			 * to be full commits
-			 */
-			btrfs_set_log_full_commit(root->fs_info, trans);
+		if (btrfs_must_commit_transaction(trans, inode)) {
 			ret = 1;
 			break;
 		}
@@ -4982,6 +5010,9 @@ process_leaf:
 			btrfs_release_path(path);
 			ret = btrfs_log_inode(trans, root, di_inode,
 					      log_mode, 0, LLONG_MAX, ctx);
+			if (!ret &&
+			    btrfs_must_commit_transaction(trans, di_inode))
+				ret = 1;
 			iput(di_inode);
 			if (ret)
 				goto next_dir_inode;
@@ -5096,6 +5127,9 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
 
 			ret = btrfs_log_inode(trans, root, dir_inode,
 					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+			if (!ret &&
+			    btrfs_must_commit_transaction(trans, dir_inode))
+				ret = 1;
 			iput(dir_inode);
 			if (ret)
 				goto out;
@@ -5447,6 +5481,9 @@ error:
  * They revolve around files there were unlinked from the directory, and
  * this function updates the parent directory so that a full commit is
  * properly done if it is fsync'd later after the unlinks are done.
+ *
+ * Must be called before the unlink operations (updates to the subvolume tree,
+ * inodes, etc) are done.
  */
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct inode *dir, struct inode *inode,
@@ -5462,8 +5499,11 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 	 * into the file.  When the file is logged we check it and
 	 * don't log the parents if the file is fully on disk.
 	 */
-	if (S_ISREG(inode->i_mode))
+	if (S_ISREG(inode->i_mode)) {
+		mutex_lock(&BTRFS_I(inode)->log_mutex);
 		BTRFS_I(inode)->last_unlink_trans = trans->transid;
+		mutex_unlock(&BTRFS_I(inode)->log_mutex);
+	}
 
 	/*
 	 * if this directory was already logged any new
@@ -5494,7 +5534,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 	return;
 
 record:
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
 }
 
 /*
@@ -5505,11 +5547,16 @@ record:
  * corresponding to the deleted snapshot's root, which could lead to replaying
  * it after replaying the log tree of the parent directory (which would replay
  * the snapshot delete operation).
+ *
+ * Must be called before the actual snapshot destroy operation (updates to the
+ * parent root and tree of tree roots trees, etc) are done.
  */
 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
 				   struct inode *dir)
 {
+	mutex_lock(&BTRFS_I(dir)->log_mutex);
 	BTRFS_I(dir)->last_unlink_trans = trans->transid;
+	mutex_unlock(&BTRFS_I(dir)->log_mutex);
 }
 
 /*
-- 
cgit v1.2.3


From 5e33a2bd7ca7fa687fb0965869196eea6815d1f3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 25 Feb 2016 23:19:38 +0000
Subject: Btrfs: do not collect ordered extents when logging that inode exists

When logging that an inode exists, for example as part of a directory
fsync operation, we were collecting any ordered extents for the inode but
we ended up doing nothing with them except tagging them as processed, by
setting the flag BTRFS_ORDERED_LOGGED on them, which prevented a
subsequent fsync of that inode (using the LOG_INODE_ALL mode) from
collecting and processing them. This created a time window where a second
fsync against the inode, using the fast path, ended up not logging the
checksums for the new extents but it logged the extents since they were
part of the list of modified extents. This happened because the ordered
extents were not collected and checksums were not yet added to the csum
tree - the ordered extents have not gone through btrfs_finish_ordered_io()
yet (which is where we add them to the csum tree by calling
inode.c:add_pending_csums()).

So fix this by not collecting an inode's ordered extents if we are logging
it with the LOG_INODE_EXISTS mode.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/tree-log.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9f6372dd0eab..9d2e8ecc5382 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4500,7 +4500,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
-	btrfs_get_logged_extents(inode, &logged_list, start, end);
+	/*
+	 * Collect ordered extents only if we are logging data. This is to
+	 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
+	 * will process the ordered extents if they still exists at the time,
+	 * because when we collect them we test and set for the flag
+	 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
+	 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
+	 * not processing the ordered extents is that we end up logging the
+	 * corresponding file extent items, based on the extent maps in the
+	 * inode's extent_map_tree's modified_list, without logging the
+	 * respective checksums (since the may still be only attached to the
+	 * ordered extents and have not been inserted in the csum tree by
+	 * btrfs_finish_ordered_io() yet).
+	 */
+	if (inode_only == LOG_INODE_ALL)
+		btrfs_get_logged_extents(inode, &logged_list, start, end);
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
-- 
cgit v1.2.3


From ebb8765b2ded869b75bf5154b048119eb52571f7 Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Thu, 10 Mar 2016 17:26:59 +0800
Subject: btrfs: move btrfs_compression_type to compression.h

So that its better organized.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c   | 1 +
 fs/btrfs/compression.h       | 9 +++++++++
 fs/btrfs/ctree.h             | 8 --------
 fs/btrfs/disk-io.c           | 1 +
 fs/btrfs/extent_map.c        | 1 +
 fs/btrfs/file-item.c         | 1 +
 fs/btrfs/file.c              | 1 +
 fs/btrfs/ioctl.c             | 1 +
 fs/btrfs/ordered-data.c      | 1 +
 fs/btrfs/props.c             | 1 +
 fs/btrfs/send.c              | 1 +
 fs/btrfs/tests/inode-tests.c | 1 +
 fs/btrfs/tree-log.c          | 1 +
 13 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 60fc45f3a361..9d226b5d6358 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -106,6 +106,7 @@
 #include "locking.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
+#include "compression.h"
 
 #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 13a4dc0436c9..f49d8b8c0f00 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -48,6 +48,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt,
 				   unsigned long pg_index,
 				   unsigned long pg_offset);
+
+enum btrfs_compression_type {
+	BTRFS_COMPRESS_NONE  = 0,
+	BTRFS_COMPRESS_ZLIB  = 1,
+	BTRFS_COMPRESS_LZO   = 2,
+	BTRFS_COMPRESS_TYPES = 2,
+	BTRFS_COMPRESS_LAST  = 3,
+};
+
 struct btrfs_compress_op {
 	struct list_head *(*alloc_workspace)(void);
 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b69ad1305b71..0ef28fd2bd27 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -718,14 +718,6 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-enum btrfs_compression_type {
-	BTRFS_COMPRESS_NONE  = 0,
-	BTRFS_COMPRESS_ZLIB  = 1,
-	BTRFS_COMPRESS_LZO   = 2,
-	BTRFS_COMPRESS_TYPES = 2,
-	BTRFS_COMPRESS_LAST  = 3,
-};
-
 struct btrfs_inode_item {
 	/* nfs style generation number */
 	__le64 generation;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a998ef15ec6d..53525a54608b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,6 +50,7 @@
 #include "raid56.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "compression.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index cdbadeaef202..cca21ff6553b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -4,6 +4,7 @@
 #include <linux/hardirq.h>
 #include "ctree.h"
 #include "extent_map.h"
+#include "compression.h"
 
 
 static struct kmem_cache *extent_map_cache;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1c50a7b09b4e..763fd170ff16 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -25,6 +25,7 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "print-tree.h"
+#include "compression.h"
 
 #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d23d10024cf0..dba5de6cdc8a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -41,6 +41,7 @@
 #include "locking.h"
 #include "volumes.h"
 #include "qgroup.h"
+#include "compression.h"
 
 static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7a1ffacd314c..053e677839fe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -60,6 +60,7 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "tree-log.h"
+#include "compression.h"
 
 #ifdef CONFIG_64BIT
 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 988eb1513aa5..86e1688c13b2 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -25,6 +25,7 @@
 #include "btrfs_inode.h"
 #include "extent_io.h"
 #include "disk-io.h"
+#include "compression.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index f9e60231f685..36992128c746 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -22,6 +22,7 @@
 #include "hash.h"
 #include "transaction.h"
 #include "xattr.h"
+#include "compression.h"
 
 #define BTRFS_PROP_HANDLERS_HT_BITS 8
 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d2e29925f1da..19b7bf4284ee 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -34,6 +34,7 @@
 #include "disk-io.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
+#include "compression.h"
 
 static int g_verbose = 0;
 
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index e2d3da02deee..863a6a3af1f8 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -22,6 +22,7 @@
 #include "../disk-io.h"
 #include "../extent_io.h"
 #include "../volumes.h"
+#include "../compression.h"
 
 static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
 			  u64 ram_bytes, u64 offset, u64 disk_bytenr,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9d2e8ecc5382..eafdb965c206 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -26,6 +26,7 @@
 #include "print-tree.h"
 #include "backref.h"
 #include "hash.h"
+#include "compression.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
-- 
cgit v1.2.3


From bb7ab3b92e46da06b580c6f83abe7894dc449cca Mon Sep 17 00:00:00 2001
From: Adam Buchbinder <adam.buchbinder@gmail.com>
Date: Fri, 4 Mar 2016 11:23:12 -0800
Subject: btrfs: Fix misspellings in comments.

Signed-off-by: Adam Buchbinder <adam.buchbinder@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/check-integrity.c |  2 +-
 fs/btrfs/ctree.h           | 10 +++++-----
 fs/btrfs/dev-replace.c     |  2 +-
 fs/btrfs/disk-io.c         |  2 +-
 fs/btrfs/extent-tree.c     |  2 +-
 fs/btrfs/extent_map.c      |  4 ++--
 fs/btrfs/file.c            |  2 +-
 fs/btrfs/ordered-data.c    |  2 +-
 fs/btrfs/tree-log.c        |  2 +-
 fs/btrfs/volumes.c         |  4 ++--
 10 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs/tree-log.c')

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 9d226b5d6358..e34a71b3e225 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -178,7 +178,7 @@ struct btrfsic_block {
  * Elements of this type are allocated dynamically and required because
  * each block object can refer to and can be ref from multiple blocks.
  * The key to lookup them in the hashtable is the dev_bytenr of
- * the block ref to plus the one from the block refered from.
+ * the block ref to plus the one from the block referred from.
  * The fact that they are searchable via a hashtable and that a
  * ref_cnt is maintained is not required for the btrfs integrity
  * check algorithm itself, it is only used to make the output more
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ef28fd2bd27..84a6a5b3384a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -788,7 +788,7 @@ struct btrfs_root_item {
 
 	/*
 	 * This generation number is used to test if the new fields are valid
-	 * and up to date while reading the root item. Everytime the root item
+	 * and up to date while reading the root item. Every time the root item
 	 * is written out, the "generation" field is copied into this field. If
 	 * anyone ever mounted the fs with an older kernel, we will have
 	 * mismatching generation values here and thus must invalidate the
@@ -1219,10 +1219,10 @@ struct btrfs_space_info {
 	 * we've called update_block_group and dropped the bytes_used counter
 	 * and increased the bytes_pinned counter.  However this means that
 	 * bytes_pinned does not reflect the bytes that will be pinned once the
-	 * delayed refs are flushed, so this counter is inc'ed everytime we call
-	 * btrfs_free_extent so it is a realtime count of what will be freed
-	 * once the transaction is committed.  It will be zero'ed everytime the
-	 * transaction commits.
+	 * delayed refs are flushed, so this counter is inc'ed every time we
+	 * call btrfs_free_extent so it is a realtime count of what will be
+	 * freed once the transaction is committed.  It will be zero'ed every
+	 * time the transaction commits.
 	 */
 	struct percpu_counter total_bytes_pinned;
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ff2db7a6c894..a1d6652e0c47 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -858,7 +858,7 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 		 * not called and the the filesystem is remounted
 		 * in degraded state. This does not stop the
 		 * dev_replace procedure. It needs to be canceled
-		 * manually if the cancelation is wanted.
+		 * manually if the cancellation is wanted.
 		 */
 		break;
 	}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 53525a54608b..c95e3ce9f22e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -816,7 +816,7 @@ static void run_one_async_done(struct btrfs_work *work)
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
-	/* If an error occured we just want to clean up the bio and move on */
+	/* If an error occurred we just want to clean up the bio and move on */
 	if (async->error) {
 		async->bio->bi_error = async->error;
 		bio_endio(async->bio);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 083783b53536..53e12977bfd0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5758,7 +5758,7 @@ out_fail:
 
 		/*
 		 * This is tricky, but first we need to figure out how much we
-		 * free'd from any free-ers that occured during this
+		 * free'd from any free-ers that occurred during this
 		 * reservation, so we reset ->csum_bytes to the csum_bytes
 		 * before we dropped our lock, and then call the free for the
 		 * number of bytes that were freed while we were trying our
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index cca21ff6553b..318b048eb254 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -62,7 +62,7 @@ struct extent_map *alloc_extent_map(void)
 
 /**
  * free_extent_map - drop reference count of an extent_map
- * @em:		extent map beeing releasead
+ * @em:		extent map being releasead
  *
  * Drops the reference out on @em by one and free the structure
  * if the reference count hits zero.
@@ -422,7 +422,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 /**
  * remove_extent_mapping - removes an extent_map from the extent tree
  * @tree:	extent tree to remove from
- * @em:		extent map beeing removed
+ * @em:		extent map being removed
  *
  * Removes @em from @tree.  No reference counts are dropped, and no checks
  * are done to see if the range is in use
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dba5de6cdc8a..15a09cb156ce 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1847,7 +1847,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	/*
 	 * We also have to set last_sub_trans to the current log transid,
 	 * otherwise subsequent syncs to a file that's been synced in this
-	 * transaction will appear to have already occured.
+	 * transaction will appear to have already occurred.
 	 */
 	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 86e1688c13b2..0de7da5a610d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1010,7 +1010,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	for (; node; node = rb_prev(node)) {
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 
-		/* We treat this entry as if it doesnt exist */
+		/* We treat this entry as if it doesn't exist */
 		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
 			continue;
 		if (test->file_offset + test->len <= disk_i_size)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index eafdb965c206..24d03c751149 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1046,7 +1046,7 @@ again:
 
 		/*
 		 * NOTE: we have searched root tree and checked the
-		 * coresponding ref, it does not need to check again.
+		 * corresponding ref, it does not need to check again.
 		 */
 		*search_done = 1;
 	}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 80857b4646c0..e2b54d546b7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2749,7 +2749,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	    em->start + em->len < chunk_offset) {
 		/*
 		 * This is a logic error, but we don't want to just rely on the
-		 * user having built with ASSERT enabled, so if ASSERT doens't
+		 * user having built with ASSERT enabled, so if ASSERT doesn't
 		 * do anything we still error out.
 		 */
 		ASSERT(0);
@@ -4119,7 +4119,7 @@ out:
  * Callback for btrfs_uuid_tree_iterate().
  * returns:
  * 0	check succeeded, the entry is not outdated.
- * < 0	if an error occured.
+ * < 0	if an error occurred.
  * > 0	if the check failed, which means the caller shall remove the entry.
  */
 static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
-- 
cgit v1.2.3